Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-06-04 18:54:00 +08:00

Merge commit b0f55ef85a
@@ -433,6 +433,7 @@ using std::ptrdiff_t;
#include "src/Core/util/IndexedViewHelper.h"
#include "src/Core/util/ReshapedHelper.h"
#include "src/Core/ArithmeticSequence.h"
+#include "src/Core/IO.h"
#include "src/Core/DenseCoeffsBase.h"
#include "src/Core/DenseBase.h"
#include "src/Core/MatrixBase.h"
@@ -482,7 +483,6 @@ using std::ptrdiff_t;
#include "src/Core/Redux.h"
#include "src/Core/Visitor.h"
#include "src/Core/Fuzzy.h"
-#include "src/Core/IO.h"
#include "src/Core/Swap.h"
#include "src/Core/CommaInitializer.h"
#include "src/Core/GeneralProduct.h"

@@ -59,4 +59,3 @@

#endif // EIGEN_GEOMETRY_MODULE_H
/* vim: set filetype=cpp et sw=2 ts=2 ai: */

@@ -25,7 +25,9 @@

#include "SparseCore"
#include "OrderingMethods"
+#ifndef EIGEN_MPL2_ONLY
#include "SparseCholesky"
+#endif
#include "SparseLU"
#include "SparseQR"
#include "IterativeLinearSolvers"

@@ -14,7 +14,7 @@
#include "Core"
#include <deque>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

#define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)

@@ -13,7 +13,7 @@
#include "Core"
#include <list>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

#define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)

@@ -14,7 +14,7 @@
#include "Core"
#include <vector>

-#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */

#define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)

@@ -14,7 +14,7 @@ namespace Eigen {

namespace internal {

-#if !EIGEN_HAS_CXX11
+#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
template<typename T> struct aseq_negate {};

template<> struct aseq_negate<Index> {
@@ -138,7 +138,7 @@ protected:

public:

-#if EIGEN_HAS_CXX11
+#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) {
return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
}

@@ -515,7 +515,7 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
{
-EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::Scalar Scalar;
typedef typename Kernel::PacketType PacketType;
@@ -563,7 +563,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
{
-EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
typedef typename Kernel::PacketType PacketType;

@@ -463,7 +463,17 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
void visit(Visitor& func) const;

-inline const WithFormat<Derived> format(const IOFormat& fmt) const;
+/** \returns a WithFormat proxy object allowing to print a matrix the with given
+  * format \a fmt.
+  *
+  * See class IOFormat for some examples.
+  *
+  * \sa class IOFormat, class WithFormat
+  */
+inline const WithFormat<Derived> format(const IOFormat& fmt) const
+{
+  return WithFormat<Derived>(derived(), fmt);
+}

/** \returns the unique coefficient of a 1x1 expression */
EIGEN_DEVICE_FUNC
@@ -13,9 +13,9 @@
#define EIGEN_MATRIXSTORAGE_H

#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
+#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
#else
-#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
#endif

namespace Eigen {
@@ -184,12 +184,16 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
{
internal::plain_array<T,Size,_Options> m_data;
public:
-EIGEN_DEVICE_FUNC DenseStorage() {}
+EIGEN_DEVICE_FUNC DenseStorage() {
+  EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+}
EIGEN_DEVICE_FUNC
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()) {}
EIGEN_DEVICE_FUNC
-DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
+DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
+  EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+}
EIGEN_DEVICE_FUNC
DenseStorage& operator=(const DenseStorage& other)
{
@@ -197,7 +201,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
-EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
EIGEN_UNUSED_VARIABLE(size);
EIGEN_UNUSED_VARIABLE(rows);
@@ -343,7 +347,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
{
-EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
}
EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
@@ -351,6 +355,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
, m_rows(other.m_rows)
, m_cols(other.m_cols)
{
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -403,7 +408,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
-EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_rows = rows;
m_cols = cols;
@@ -422,7 +427,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
{
-EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
EIGEN_UNUSED_VARIABLE(rows);
}
@@ -430,6 +435,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
, m_cols(other.m_cols)
{
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -477,7 +483,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
-EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_cols = cols;
}
@@ -495,7 +501,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
{
-EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
EIGEN_UNUSED_VARIABLE(cols);
}
@@ -503,6 +509,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
, m_rows(other.m_rows)
{
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
@@ -550,7 +557,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
-EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_rows = rows;
}
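The DenseStorage hunks above route every constructor through EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X), which now executes its argument X (typically a declaration of a local `size`) before the user-supplied EIGEN_DENSE_STORAGE_CTOR_PLUGIN. A minimal client-side sketch, not part of the commit, of what that enables (the counter names are illustrative):

#include <cstddef>
#include <iostream>

// Hypothetical instrumentation plugin: count DenseStorage constructions and
// record the last allocated size. The `size` used here is the local declared
// by the internal macro's argument, as shown in the hunks above.
static std::ptrdiff_t ctor_calls = 0;
static std::ptrdiff_t last_size = 0;
#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { ++ctor_calls; last_size = size; }
#include <Eigen/Core>

int main() {
  Eigen::Matrix4f a;        // fixed size: the plugin sees size == 16
  Eigen::MatrixXf b(3, 5);  // dynamic size: the plugin sees size == 15
  std::cout << ctor_calls << " constructions, last size " << last_size << "\n";
}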
@@ -109,20 +109,6 @@ class WithFormat
IOFormat m_format;
};

-/** \returns a WithFormat proxy object allowing to print a matrix the with given
-  * format \a fmt.
-  *
-  * See class IOFormat for some examples.
-  *
-  * \sa class IOFormat, class WithFormat
-  */
-template<typename Derived>
-inline const WithFormat<Derived>
-DenseBase<Derived>::format(const IOFormat& fmt) const
-{
-return WithFormat<Derived>(derived(), fmt);
-}

namespace internal {

// NOTE: This helper is kept for backward compatibility with previous code specializing

@@ -19,8 +19,8 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices> >
: traits<XprType>
{
enum {
-RowsAtCompileTime = array_size<RowIndices>::value,
-ColsAtCompileTime = array_size<ColIndices>::value,
+RowsAtCompileTime = int(array_size<RowIndices>::value),
+ColsAtCompileTime = int(array_size<ColIndices>::value),
MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : int(traits<XprType>::MaxRowsAtCompileTime),
MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : int(traits<XprType>::MaxColsAtCompileTime),
@@ -29,8 +29,8 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices> >
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
: XprTypeIsRowMajor,

-RowIncr = get_compile_time_incr<RowIndices>::value,
-ColIncr = get_compile_time_incr<ColIndices>::value,
+RowIncr = int(get_compile_time_incr<RowIndices>::value),
+ColIncr = int(get_compile_time_incr<ColIndices>::value),
InnerIncr = IsRowMajor ? ColIncr : RowIncr,
OuterIncr = IsRowMajor ? RowIncr : ColIncr,
@@ -51,7 +51,7 @@ struct traits<IndexedView<XprType, RowIndices, ColIndices> >

// FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
// but this is too strict regarding negative strides...
-DirectAccessMask = (InnerIncr!=UndefinedIncr && OuterIncr!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0,
+DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0,
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit

@@ -812,6 +812,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
this->_set_noalias(other);
}

+// Initialize an arbitrary matrix from an object convertible to the Derived type.
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE void _init1(const Derived& other){
+this->_set_noalias(other);
+}
+
// Initialize an arbitrary matrix from a generic Eigen expression
template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC
@@ -834,7 +841,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
this->derived() = r;
}

-// For fixed -size arrays:
+// For fixed-size Array<Scalar,...>
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
@@ -846,6 +853,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
Base::setConstant(val0);
}

+// For fixed-size Array<Index,...>
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Index& val0,
@@ -46,7 +46,7 @@ typedef uint32x4_t Packet4ui;
const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
@@ -83,7 +83,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasSqrt = 0
};
};
-template<> struct packet_traits<int> : default_packet_traits
+template<> struct packet_traits<int32_t> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half; // Packet2i intrinsics not implemented yet
@@ -105,11 +105,11 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif

-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }

template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
@@ -117,7 +117,7 @@ template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
Packet4f countdown = vld1q_f32(f);
return vaddq_f32(pset1<Packet4f>(a), countdown);
}
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
{
const int32_t i[] = {0, 1, 2, 3};
Packet4i countdown = vld1q_s32(i);
@@ -240,20 +240,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }

-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }

-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }

-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
float32x2_t lo, hi;
lo = vld1_dup_f32(from);
hi = vld1_dup_f32(from+1);
return vcombine_f32(lo, hi);
}
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
{
int32x2_t lo, hi;
lo = vld1_dup_s32(from);
@@ -261,11 +261,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
return vcombine_s32(lo, hi);
}

-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }

-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
@@ -276,7 +276,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
res = vsetq_lane_f32(from[3*stride], res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
{
Packet4i res = pset1<Packet4i>(0);
res = vsetq_lane_s32(from[0*stride], res, 0);
@@ -293,7 +293,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
to[stride*2] = vgetq_lane_f32(from, 2);
to[stride*3] = vgetq_lane_f32(from, 3);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
{
to[stride*0] = vgetq_lane_s32(from, 0);
to[stride*1] = vgetq_lane_s32(from, 1);
@@ -301,12 +301,12 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
to[stride*3] = vgetq_lane_s32(from, 3);
}

-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }

// FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi;
@@ -361,7 +361,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
return sum;
}

-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, sum;

@@ -408,7 +408,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)

return vget_lane_f32(prod, 0);
}
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, prod;

@@ -436,7 +436,7 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
return vget_lane_f32(min, 0);
}

-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, min;

@@ -461,7 +461,7 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
return vget_lane_f32(max, 0);
}

-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, max;
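The NEON hunks above re-key the integer packet kernels on the fixed-width int32_t (the element type of int32x4_t) instead of plain int. A minimal sketch of the calling convention this gives; it is only meaningful on a NEON-enabled build where these specializations are compiled in, and the buffer names are illustrative:

#include <Eigen/Core>
#include <cstdint>

// Round-trip an int32_t buffer through the Packet4i kernels shown above.
int main() {
  using namespace Eigen::internal;
  EIGEN_ALIGN16 std::int32_t in[4]  = {1, 2, 3, 4};
  EIGEN_ALIGN16 std::int32_t out[4] = {0, 0, 0, 0};
  Packet4i p = pload<Packet4i>(in);   // overload now taking const int32_t*
  pstore(out, p);                     // overload now taking int32_t*
  return (pfirst<Packet4i>(p) == 1 && out[3] == 4) ? 0 : 1;
}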
@@ -148,7 +148,7 @@ struct tribb_kernel
ResMapper res(_res, resStride);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;

-Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
+Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));

// let's process the block per panel of actual_mc x BlockSize,
// again, each is split into three parts, etc.

@@ -104,13 +104,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
// - the sizes are large enough

// compute the maximal number of threads from the size of the product:
-// FIXME this has to be fine tuned
+// This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
Index size = transpose ? rows : cols;
-Index pb_max_threads = std::max<Index>(1,size / 32);
+Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);

// compute the maximal number of threads from the total amount of work:
double work = static_cast<double>(rows) * static_cast<double>(cols) *
static_cast<double>(depth);
-double kMinTaskSize = 50000; // Heuristic.
+double kMinTaskSize = 50000; // FIXME improve this heuristic.
pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));

// compute the number of threads we are going to use
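A rough, standalone illustration of the updated thread-count heuristic in parallelize_gemm, with made-up sizes (nr is taken as 4 purely for the example):

#include <algorithm>
#include <cstdio>

// Sketch: cap the thread count by columns/nr and by total work / 50000.
int main() {
  long rows = 480, cols = 480, depth = 480, nr = 4;            // illustrative sizes
  long by_size = std::max(1L, cols / nr);                      // 120
  double work  = double(rows) * double(cols) * double(depth);  // 110'592'000
  long by_work = long(work / 50000);                           // 2211
  long max_threads = std::max(1L, std::min(by_size, by_work)); // 120
  std::printf("at most %ld threads before clamping to the pool size\n", max_threads);
}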
@@ -137,7 +137,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

-Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
+Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
@@ -284,7 +284,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

-Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
+Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();

@@ -23,7 +23,7 @@

/// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
#ifdef __GNUC__
-#define EIGEN_COMP_GNUC 1
+#define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__)
#else
#define EIGEN_COMP_GNUC 0
#endif
@@ -80,8 +80,8 @@
// 2015 14 1900
// "15" 15 1900

-/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC
-#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
+/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl
+#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
#define EIGEN_COMP_MSVC_STRICT _MSC_VER
#else
#define EIGEN_COMP_MSVC_STRICT 0
@@ -349,6 +349,14 @@
# define __has_feature(x) 0
#endif

+// Some old compilers do not support template specializations like:
+// template<typename T,int N> void foo(const T x[N]);
+#if !( EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || defined(__apple_build_version__)) || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49)
+#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1
+#else
+#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
+#endif
+
// Upperbound on the C++ version to use.
// Expected values are 03, 11, 14, 17, etc.
// By default, let's use an arbitrarily large C++ version.
@@ -829,7 +837,7 @@ namespace Eigen {
// just an empty macro !
#define EIGEN_EMPTY

-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || __CUDACC_VER__) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
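Two of the Macros.h changes above interact: EIGEN_COMP_GNUC now encodes the GCC version as major*10+minor, and the new EIGEN_HAS_STATIC_ARRAY_TEMPLATE flag, like the ArithmeticSequence.h guards earlier in this diff, is expressed in that encoding. A small standalone sketch of how such version guards read (DEMO_ names are illustrative):

#include <cstdio>

// With the new encoding, GCC 4.8 reports 48, GCC 7.1 reports 71, non-GCC
// compilers still report 0, so plain truth tests keep working while version
// tests such as ">= 48" become possible.
#if defined(__GNUC__)
#define DEMO_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__)
#else
#define DEMO_COMP_GNUC 0
#endif

int main() {
#if DEMO_COMP_GNUC && DEMO_COMP_GNUC >= 48
  std::puts("GCC >= 4.8: the C++11-only code paths may be enabled");
#else
  std::puts("not GCC >= 4.8: fall back to the conservative paths");
#endif
}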
@@ -97,17 +97,22 @@ template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };

-template<typename T> struct is_integral { enum { value = false }; };
-template<> struct is_integral<bool> { enum { value = true }; };
-template<> struct is_integral<char> { enum { value = true }; };
-template<> struct is_integral<signed char> { enum { value = true }; };
-template<> struct is_integral<unsigned char> { enum { value = true }; };
-template<> struct is_integral<signed short> { enum { value = true }; };
-template<> struct is_integral<unsigned short> { enum { value = true }; };
-template<> struct is_integral<signed int> { enum { value = true }; };
-template<> struct is_integral<unsigned int> { enum { value = true }; };
-template<> struct is_integral<signed long> { enum { value = true }; };
-template<> struct is_integral<unsigned long> { enum { value = true }; };
+#if EIGEN_HAS_CXX11
+using std::is_integral;
+#else
+template<typename T> struct is_integral { enum { value = false }; };
+template<> struct is_integral<bool> { enum { value = true }; };
+template<> struct is_integral<char> { enum { value = true }; };
+template<> struct is_integral<signed char> { enum { value = true }; };
+template<> struct is_integral<unsigned char> { enum { value = true }; };
+template<> struct is_integral<signed short> { enum { value = true }; };
+template<> struct is_integral<unsigned short> { enum { value = true }; };
+template<> struct is_integral<signed int> { enum { value = true }; };
+template<> struct is_integral<unsigned int> { enum { value = true }; };
+template<> struct is_integral<signed long> { enum { value = true }; };
+template<> struct is_integral<unsigned long> { enum { value = true }; };
+#endif


template <typename T> struct add_const { typedef const T type; };
template <typename T> struct add_const<T&> { typedef T& type; };
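An illustrative consequence of the Meta.h change (a sketch, not part of the commit): under the C++11 branch, internal::is_integral picks up std::is_integral, so integer types the hand-written list never covered are classified correctly, which is what the extended index-type checks in test/basicstuff.cpp further down rely on.

#include <Eigen/Core>

// These hold under the C++11 branch; the old hand-rolled trait reported false
// for the 64-bit types because they were never specialized.
static_assert(Eigen::internal::is_integral<int>::value, "int is integral");
static_assert(Eigen::internal::is_integral<long long>::value, "long long is integral");
static_assert(Eigen::internal::is_integral<unsigned long long>::value, "unsigned long long is integral");
static_assert(!Eigen::internal::is_integral<double>::value, "double is not integral");

int main() { return 0; }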
@@ -250,7 +250,7 @@ template<typename _MatrixType> class ComplexEigenSolver
EigenvectorType m_matX;

private:
-void doComputeEigenvectors(const RealScalar& matrixnorm);
+void doComputeEigenvectors(RealScalar matrixnorm);
void sortEigenvalues(bool computeEigenvectors);
};
@@ -284,10 +284,12 @@ ComplexEigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool


template<typename MatrixType>
-void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(const RealScalar& matrixnorm)
+void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(RealScalar matrixnorm)
{
const Index n = m_eivalues.size();

+matrixnorm = numext::maxi(matrixnorm,(std::numeric_limits<RealScalar>::min)());
+
// Compute X such that T = X D X^(-1), where D is the diagonal of T.
// The matrix X is unit triangular.
m_matX = EigenvectorType::Zero(n, n);

@@ -248,12 +248,24 @@ template<typename MatrixType>
template<typename InputType>
RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU)
{
+const Scalar considerAsZero = (std::numeric_limits<Scalar>::min)();
+
eigen_assert(matrix.cols() == matrix.rows());
Index maxIters = m_maxIters;
if (maxIters == -1)
maxIters = m_maxIterationsPerRow * matrix.rows();

+Scalar scale = matrix.derived().cwiseAbs().maxCoeff();
+if(scale<considerAsZero)
+{
+m_matT.setZero(matrix.rows(),matrix.cols());
+if(computeU)
+m_matU.setIdentity(matrix.rows(),matrix.cols());
+m_info = Success;
+m_isInitialized = true;
+m_matUisUptodate = computeU;
+return *this;
+}
+
// Step 1. Reduce to Hessenberg form
m_hess.compute(matrix.derived()/scale);
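A small usage sketch of the behavior the RealSchur guard above is meant to give (an assumption based on this hunk, not a statement from the commit): a numerically zero input should short-circuit with T = 0 and U = I instead of iterating on a degenerate problem, and non-trivial inputs are rescaled by their largest absolute coefficient before the Hessenberg reduction.

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Zero(4, 4);
  Eigen::RealSchur<Eigen::MatrixXd> schur(A);
  std::cout << "info == Success: " << (schur.info() == Eigen::Success) << "\n";
  std::cout << "U:\n" << schur.matrixU() << "\n";  // expected: identity
  std::cout << "T:\n" << schur.matrixT() << "\n";  // expected: all zeros
}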
@@ -414,7 +414,8 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>

if(n==1)
{
-m_eivalues.coeffRef(0,0) = numext::real(matrix.diagonal()[0]);
+m_eivec = matrix;
+m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0));
if(computeEigenvectors)
m_eivec.setOnes(n,n);
m_info = Success;

@@ -87,7 +87,8 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec
const TriangularView<const VectorsType, UnitLower> V(vectors);

// A -= V T V^* A
-Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,0,
+Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,
+(VectorsType::MaxColsAtCompileTime==1 && MatrixType::MaxColsAtCompileTime!=1)?RowMajor:ColMajor,
VectorsType::MaxColsAtCompileTime,MatrixType::MaxColsAtCompileTime> tmp = V.adjoint() * mat;
// FIXME add .noalias() once the triangular product can work inplace
if(forward) tmp = T.template triangularView<Upper>() * tmp;

@@ -112,9 +112,11 @@ public:
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-Options = MatrixType::Options
+TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor))
+: ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor)
+: MatrixType::Options
};
-typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
+typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, TrOptions, MaxColsAtCompileTime, MaxRowsAtCompileTime>
TransposeTypeWithSameStorageOrder;

void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
@@ -200,10 +202,12 @@ public:
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-Options = MatrixType::Options
+TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor))
+: ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor)
+: MatrixType::Options
};

-typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
+typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, TrOptions, MaxColsAtCompileTime, MaxRowsAtCompileTime>
TransposeTypeWithSameStorageOrder;

void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)

@@ -336,7 +336,7 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator
{
do {
++m_cachedIndex;
-} while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<m_epsilon);
+} while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<=m_epsilon);
if (m_cachedIndex<m_vector.m_end)
m_cachedValue = m_vector.m_buffer[m_cachedIndex];
else
@@ -347,7 +347,7 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator
ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
do {
m_currentEl = llElements[m_currentEl].next;
-} while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<m_epsilon);
+} while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<=m_epsilon);
if (m_currentEl<0)
{
m_cachedIndex = -1;
@@ -363,9 +363,9 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator

protected:
const AmbiVector& m_vector; // the target vector
-StorageIndex m_currentEl;   // the current element in sparse/linked-list mode
+StorageIndex m_currentEl;   // the current element in sparse/linked-list mode
RealScalar m_epsilon;       // epsilon used to prune zero coefficients
-StorageIndex m_cachedIndex; // current coordinate
+StorageIndex m_cachedIndex; // current coordinate
Scalar m_cachedValue;       // current value
bool m_isDense;             // mode of the vector
};

@@ -357,6 +357,16 @@ struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, Itera
explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
};

+// "sparse ./ dense"
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+: sparse_conjunction_evaluator<CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs> >
+{
+typedef CwiseBinaryOp<scalar_quotient_op<T1,T2>, Lhs, Rhs> XprType;
+typedef sparse_conjunction_evaluator<XprType> Base;
+explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
// "sparse && sparse"
template<typename Lhs, typename Rhs>
struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op, Lhs, Rhs>, IteratorBased, IteratorBased>
|
||||
///
|
||||
/// \sa class Block, fix, fix<N>(int)
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -92,8 +92,8 @@ block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols)
|
||||
}
|
||||
|
||||
/// This is the const version of block(Index,Index,NRowsType,NColsType)
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -124,8 +124,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -138,8 +138,8 @@ topRightCorner(NRowsType cRows, NColsType cCols)
|
||||
}
|
||||
|
||||
/// This is the const version of topRightCorner(NRowsType, NColsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -229,8 +229,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -243,8 +243,8 @@ topLeftCorner(NRowsType cRows, NColsType cCols)
|
||||
}
|
||||
|
||||
/// This is the const version of topLeftCorner(Index, Index).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -333,8 +333,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -348,8 +348,8 @@ bottomRightCorner(NRowsType cRows, NColsType cCols)
|
||||
}
|
||||
|
||||
/// This is the const version of bottomRightCorner(NRowsType, NColsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -439,8 +439,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -454,8 +454,8 @@ bottomLeftCorner(NRowsType cRows, NColsType cCols)
|
||||
}
|
||||
|
||||
/// This is the const version of bottomLeftCorner(NRowsType, NColsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType, typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -544,8 +544,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
|
||||
#else
|
||||
@ -558,8 +558,8 @@ topRows(NRowsType n)
|
||||
}
|
||||
|
||||
/// This is the const version of topRows(NRowsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
|
||||
#else
|
||||
@ -619,8 +619,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
|
||||
#else
|
||||
@ -633,8 +633,8 @@ bottomRows(NRowsType n)
|
||||
}
|
||||
|
||||
/// This is the const version of bottomRows(NRowsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
|
||||
#else
|
||||
@ -695,8 +695,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
|
||||
#else
|
||||
@ -709,8 +709,8 @@ middleRows(Index startRow, NRowsType n)
|
||||
}
|
||||
|
||||
/// This is the const version of middleRows(Index,NRowsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NRowsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
|
||||
#else
|
||||
@ -771,8 +771,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -785,8 +785,8 @@ leftCols(NColsType n)
|
||||
}
|
||||
|
||||
/// This is the const version of leftCols(NColsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -846,8 +846,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -860,8 +860,8 @@ rightCols(NColsType n)
|
||||
}
|
||||
|
||||
/// This is the const version of rightCols(NColsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -922,8 +922,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -936,8 +936,8 @@ middleCols(Index startCol, NColsType numCols)
|
||||
}
|
||||
|
||||
/// This is the const version of middleCols(Index,NColsType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NColsType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
|
||||
#else
|
||||
@ -1130,8 +1130,8 @@ inline ConstRowXpr row(Index i) const
|
||||
///
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), fix<N>, fix<N>(int), class Block
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
|
||||
#else
|
||||
@ -1146,8 +1146,8 @@ segment(Index start, NType n)
|
||||
|
||||
|
||||
/// This is the const version of segment(Index,NType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
|
||||
#else
|
||||
@ -1180,8 +1180,8 @@ segment(Index start, NType n) const
|
||||
///
|
||||
/// \sa class Block, block(Index,Index)
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
|
||||
#else
|
||||
@ -1195,8 +1195,8 @@ head(NType n)
|
||||
}
|
||||
|
||||
/// This is the const version of head(NType).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
|
||||
#else
|
||||
@ -1229,8 +1229,8 @@ head(NType n) const
|
||||
///
|
||||
/// \sa class Block, block(Index,Index)
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
|
||||
#else
|
||||
@ -1244,8 +1244,8 @@ tail(NType n)
|
||||
}
|
||||
|
||||
/// This is the const version of tail(Index).
|
||||
EIGEN_DEVICE_FUNC
|
||||
template<typename NType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
|
||||
#else
|
||||
|
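All of the BlockMethods.h hunks above apply the same mechanical change: EIGEN_DEVICE_FUNC is moved from before the template<...> header to after it. A minimal sketch of why the ordering matters, under the assumption that the macro expands to __host__ __device__ under CUDA and such specifiers cannot precede the template keyword (DEMO_ names are illustrative):

#if defined(__CUDACC__)
#define DEMO_DEVICE_FUNC __host__ __device__
#else
#define DEMO_DEVICE_FUNC
#endif

struct Demo {
  // Accepted everywhere: the annotation follows the template header.
  template <typename T>
  DEMO_DEVICE_FUNC T twice(T x) const { return x + x; }
  // The pre-patch ordering would place the expansion before "template",
  // which only went unnoticed while the macro expanded to nothing.
};

int main() { return Demo().twice(21) == 42 ? 0 : 1; }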
@@ -7,7 +7,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

-#ifndef EIGEN_PARSED_BY_DOXYGEN
+#if !defined(EIGEN_PARSED_BY_DOXYGEN)

// This file is automatically included twice to generate const and non-const versions

@@ -112,6 +112,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND
return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols()));
}

+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
// The folowing three overloads are needed to handle raw Index[N] arrays.

template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndices>
@@ -138,6 +140,8 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col
(derived(), rowIndices, colIndices);
}

+#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
// Overloads for 1D vectors/arrays

template<typename Indices>
@@ -181,6 +185,8 @@ operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST
return Base::operator()(internal::eval_expr_given_size(id,size()));
}

+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
template<typename IndicesT, std::size_t IndicesN>
typename internal::enable_if<IsRowMajor,
IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]> >::type
@@ -201,6 +207,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
(derived(), indices, IvcIndex(0));
}

+#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
#undef EIGEN_INDEXED_VIEW_METHOD_CONST
#undef EIGEN_INDEXED_VIEW_METHOD_TYPE

@@ -256,5 +264,4 @@
IndexedView_or_VectorBlock
operator()(const Indices& indices);

-#endif // EIGEN_PARSED_BY_DOXYGEN

#endif // EIGEN_PARSED_BY_DOXYGEN
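A short usage sketch of the overloads that the new EIGEN_HAS_STATIC_ARRAY_TEMPLATE guards switch on and off, assuming Eigen 3.4 or later where the indexed-view API and Eigen::all are available; on compilers where the flag is 0, the std::array and initializer-list forms exercised in test/indexed_view.cpp further down remain usable:

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 6);
  int rows[] = {1, 3, 5};           // raw Index[N] array of row indices
  auto view = A(rows, Eigen::all);  // overload guarded by EIGEN_HAS_STATIC_ARRAY_TEMPLATE
  return (view.rows() == 3 && view.cols() == 6) ? 0 : 1;
}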
@@ -5,29 +5,12 @@

#include "tensor_benchmarks.h"

-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
// Simple functions
-template <typename device_selector>
-cl::sycl::queue sycl_queue() {
-return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) {
-for (const auto& e : l) {
-try {
-std::rethrow_exception(e);
-} catch (cl::sycl::exception e) {
-std::cout << e.what() << std::endl;
-}
-}
-});
-}

#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
-cl::sycl::queue q = sycl_queue<cl::sycl::gpu_selector>(); \
-Eigen::SyclDevice device(q); \
+cl::sycl::gpu_selector selector; \
+Eigen::QueueInterface queue(selector); \
+Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
suite.FUNC(iters); \
} \

@@ -138,7 +138,7 @@ else()
message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
endif()

-set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1)
+set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -Wall -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1)

# Check if the platform is supported
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"

@@ -151,6 +151,7 @@ ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
ei_add_test(unalignedassert)
ei_add_test(vectorization_logic)
ei_add_test(basicstuff)
+ei_add_test(constructor)
ei_add_test(linearstructure)
ei_add_test(integer_types)
ei_add_test(unalignedcount)

@@ -49,6 +49,22 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
v1[r] = x;
VERIFY_IS_APPROX(x, v1[r]);

+// test fetching with various index types.
+Index r1 = internal::random<Index>(0, numext::mini(Index(127),rows-1));
+x = v1(static_cast<char>(r1));
+x = v1(static_cast<signed char>(r1));
+x = v1(static_cast<unsigned char>(r1));
+x = v1(static_cast<signed short>(r1));
+x = v1(static_cast<unsigned short>(r1));
+x = v1(static_cast<signed int>(r1));
+x = v1(static_cast<unsigned int>(r1));
+x = v1(static_cast<signed long>(r1));
+x = v1(static_cast<unsigned long>(r1));
+#if EIGEN_HAS_CXX11
+x = v1(static_cast<long long int>(r1));
+x = v1(static_cast<unsigned long long int>(r1));
+#endif

VERIFY_IS_APPROX( v1, v1);
VERIFY_IS_NOT_APPROX( v1, 2*v1);
VERIFY_IS_MUCH_SMALLER_THAN( vzero, v1);
84
test/constructor.cpp
Normal file
84
test/constructor.cpp
Normal file
@ -0,0 +1,84 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.


#define TEST_ENABLE_TEMPORARY_TRACKING

#include "main.h"

template<typename MatrixType> struct Wrapper
{
MatrixType m_mat;
inline Wrapper(const MatrixType &x) : m_mat(x) {}
inline operator const MatrixType& () const { return m_mat; }
inline operator MatrixType& () { return m_mat; }
};

template<typename MatrixType> void ctor_init1(const MatrixType& m)
{
// Check logic in PlainObjectBase::_init1
Index rows = m.rows();
Index cols = m.cols();

MatrixType m0 = MatrixType::Random(rows,cols);

VERIFY_EVALUATION_COUNT( MatrixType m1(m0), 1);
VERIFY_EVALUATION_COUNT( MatrixType m2(m0+m0), 1);
VERIFY_EVALUATION_COUNT( MatrixType m2(m0.block(0,0,rows,cols)) , 1);

Wrapper<MatrixType> wrapper(m0);
VERIFY_EVALUATION_COUNT( MatrixType m3(wrapper) , 1);
}


void test_constructor()
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( ctor_init1(Matrix<float, 1, 1>()) );
CALL_SUBTEST_1( ctor_init1(Matrix4d()) );
CALL_SUBTEST_1( ctor_init1(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_1( ctor_init1(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
{
Matrix<Index,1,1> a(123);
VERIFY_IS_EQUAL(a[0], 123);
}
{
Matrix<Index,1,1> a(123.0);
VERIFY_IS_EQUAL(a[0], 123);
}
{
Matrix<float,1,1> a(123);
VERIFY_IS_EQUAL(a[0], 123.f);
}
{
Array<Index,1,1> a(123);
VERIFY_IS_EQUAL(a[0], 123);
}
{
Array<Index,1,1> a(123.0);
VERIFY_IS_EQUAL(a[0], 123);
}
{
Array<float,1,1> a(123);
VERIFY_IS_EQUAL(a[0], 123.f);
}
{
Array<Index,3,3> a(123);
VERIFY_IS_EQUAL(a(4), 123);
}
{
Array<Index,3,3> a(123.0);
VERIFY_IS_EQUAL(a(4), 123);
}
{
Array<float,3,3> a(123);
VERIFY_IS_EQUAL(a(4), 123.f);
}
}
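For readers skimming the new test: ctor_init1 and the blocks above exercise the single-argument constructor disambiguation handled by PlainObjectBase::_init1. Below is a minimal standalone sketch of that behaviour, purely illustrative, not part of the commit, and assuming an Eigen build that contains this change.

// Illustrative only: one scalar argument is a coefficient value for
// compile-time-sized types, but a size for dynamic vectors.
#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::Matrix<float, 1, 1> a(123);   // fixed 1x1: 123 is the coefficient value
  Eigen::Array<int, 3, 3>    b(7);     // fixed 3x3 array: every coefficient set to 7 (as the test above checks)
  Eigen::VectorXd            v(5);     // dynamic vector: 5 is interpreted as the size
  std::cout << a(0) << ' ' << b(2, 2) << ' ' << v.size() << '\n';  // prints "123 7 5"
  return 0;
}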
@ -131,6 +131,15 @@ template<typename MatrixType> void eigensolver(const MatrixType& m)
ComplexEigenSolver<MatrixType> eig(a.adjoint() * a);
eig.compute(a.adjoint() * a);
}

// regression test for bug 478
{
a.setZero();
ComplexEigenSolver<MatrixType> ei3(a);
VERIFY_IS_EQUAL(ei3.info(), Success);
VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
}
}

template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m)

@ -76,6 +76,15 @@ template<typename MatrixType> void eigensolver(const MatrixType& m)
EigenSolver<MatrixType> eig(a.adjoint() * a);
eig.compute(a.adjoint() * a);
}

// regression test for bug 478
{
a.setZero();
EigenSolver<MatrixType> ei3(a);
VERIFY_IS_EQUAL(ei3.info(), Success);
VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
}
}

template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m)
@ -180,6 +180,15 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
SelfAdjointEigenSolver<MatrixType> eig(a.adjoint() * a);
eig.compute(a.adjoint() * a);
}

// regression test for bug 478
{
a.setZero();
SelfAdjointEigenSolver<MatrixType> ei3(a);
VERIFY_IS_EQUAL(ei3.info(), Success);
VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1));
VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity());
}
}

template<int>
@ -297,7 +297,7 @@ void check_indexed_view()

VERIFY_IS_APPROX( (A(std::array<int,3>{{1,3,5}}, std::array<int,4>{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) );

#if (!EIGEN_COMP_CLANG) || (EIGEN_COMP_CLANG>=308 && !defined(__apple_build_version__))
#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array<int,4>{{3, 1, 6, 5}}, all) );
VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array<int,4>{{3, 1, 6, 5}}) );
VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array<int,3>{{1,3,5}},std::array<int,4>{{3, 1, 6, 5}}) );
@ -101,6 +101,12 @@ void test_jacobisvd()
// Test on inf/nan matrix
CALL_SUBTEST_7( (svd_inf_nan<JacobiSVD<MatrixXf>, MatrixXf>()) );
CALL_SUBTEST_10( (svd_inf_nan<JacobiSVD<MatrixXd>, MatrixXd>()) );

// bug1395 test compile-time vectors as input
CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,6,1>()) ));
CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,1,6>()) ));
CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,Dynamic,1>(r)) ));
CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix<double,1,Dynamic>(c)) ));
}

CALL_SUBTEST_7(( jacobisvd<MatrixXf>(MatrixXf(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
@ -41,6 +41,7 @@
#include <complex>
#include <deque>
#include <queue>
#include <cassert>
#include <list>
#if __cplusplus >= 201103L
#include <random>
@ -79,10 +80,12 @@
#ifdef TEST_ENABLE_TEMPORARY_TRACKING

static long int nb_temporaries;
static long int nb_temporaries_on_assert = -1;

inline void on_temporary_creation(long int size) {
// here's a great place to set a breakpoint when debugging failures in this test!
if(size!=0) nb_temporaries++;
if(nb_temporaries_on_assert>0) assert(nb_temporaries<nb_temporaries_on_assert);
}

#define EIGEN_DENSE_STORAGE_CTOR_PLUGIN { on_temporary_creation(size); }
@ -12,7 +12,9 @@
#include <Eigen/SparseCore>
#include <Eigen/SparseLU>
#include <Eigen/SparseQR>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/Eigen>

int main()
{
@ -37,8 +37,7 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
RightPermutationType rp(rv);
MatrixType m_permuted = MatrixType::Random(rows,cols);

const int one_if_dynamic = MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0;
VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, one_if_dynamic); // 1 temp for sub expression "lp * m_original"
VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, 1); // 1 temp for sub expression "lp * m_original"

for (int i=0; i<rows; i++)
for (int j=0; j<cols; j++)
@ -50,7 +49,7 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
VERIFY_IS_APPROX(m_permuted, lm*m_original*rm);

m_permuted = m_original;
VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, one_if_dynamic);
VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, 1);
VERIFY_IS_APPROX(m_permuted, lm*m_original*rm);

VERIFY_IS_APPROX(lp.inverse()*m_permuted*rp.inverse(), m_original);
@ -75,19 +74,19 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)

// check inplace permutations
m_permuted = m_original;
VERIFY_EVALUATION_COUNT(m_permuted.noalias()= lp.inverse() * m_permuted, one_if_dynamic); // 1 temp to allocate the mask
VERIFY_EVALUATION_COUNT(m_permuted.noalias()= lp.inverse() * m_permuted, 1); // 1 temp to allocate the mask
VERIFY_IS_APPROX(m_permuted, lp.inverse()*m_original);

m_permuted = m_original;
VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp.inverse(), one_if_dynamic); // 1 temp to allocate the mask
VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp.inverse(), 1); // 1 temp to allocate the mask
VERIFY_IS_APPROX(m_permuted, m_original*rp.inverse());

m_permuted = m_original;
VERIFY_EVALUATION_COUNT(m_permuted.noalias() = lp * m_permuted, one_if_dynamic); // 1 temp to allocate the mask
VERIFY_EVALUATION_COUNT(m_permuted.noalias() = lp * m_permuted, 1); // 1 temp to allocate the mask
VERIFY_IS_APPROX(m_permuted, lp*m_original);

m_permuted = m_original;
VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp, one_if_dynamic); // 1 temp to allocate the mask
VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp, 1); // 1 temp to allocate the mask
VERIFY_IS_APPROX(m_permuted, m_original*rp);

if(rows>1 && cols>1)
@ -70,10 +70,10 @@ template<typename MatrixType> void matrixRedux(const MatrixType& m)
VERIFY_IS_APPROX(m1.block(r0,c0,0,0).prod(), Scalar(1));

// test nesting complex expression
VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0) );
VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1) );
Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> m2(rows,rows);
m2.setRandom();
VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(), (MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0) );
VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(),(MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1));
}

template<typename VectorType> void vectorRedux(const VectorType& w)
@ -156,8 +156,10 @@ void test_redux()
CALL_SUBTEST_1( matrixRedux(Array<float, 1, 1>()) );
CALL_SUBTEST_2( matrixRedux(Matrix2f()) );
CALL_SUBTEST_2( matrixRedux(Array2f()) );
CALL_SUBTEST_2( matrixRedux(Array22f()) );
CALL_SUBTEST_3( matrixRedux(Matrix4d()) );
CALL_SUBTEST_3( matrixRedux(Array4d()) );
CALL_SUBTEST_3( matrixRedux(Array44d()) );
CALL_SUBTEST_4( matrixRedux(MatrixXcf(internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
CALL_SUBTEST_4( matrixRedux(ArrayXXcf(internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
CALL_SUBTEST_5( matrixRedux(MatrixXd (internal::random<int>(1,maxsize), internal::random<int>(1,maxsize))) );
@ -161,17 +161,21 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
if(internal::random<bool>())
m1.makeCompressed();

Index m1_nnz = m1.nonZeros();

VERIFY_IS_APPROX(m1*s1, refM1*s1);
VERIFY_IS_APPROX(m1+m2, refM1+refM2);
VERIFY_IS_APPROX(m1+m2+m3, refM1+refM2+refM3);
VERIFY_IS_APPROX(m3.cwiseProduct(m1+m2), refM3.cwiseProduct(refM1+refM2));
VERIFY_IS_APPROX(m1*s1-m2, refM1*s1-refM2);
VERIFY_IS_APPROX(m4=m1/s1, refM1/s1);
VERIFY_IS_EQUAL(m4.nonZeros(), m1_nnz);

if(SparseMatrixType::IsRowMajor)
VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.row(0)), refM1.row(0).dot(refM2.row(0)));
else
VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.col(0)), refM1.col(0).dot(refM2.col(0)));


DenseVector rv = DenseVector::Random(m1.cols());
DenseVector cv = DenseVector::Random(m1.rows());
Index r = internal::random<Index>(0,m1.rows()-2);
@ -208,8 +212,12 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re

VERIFY_IS_APPROX(m1.sum(), refM1.sum());

m4 = m1; refM4 = m4;

VERIFY_IS_APPROX(m1*=s1, refM1*=s1);
VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
VERIFY_IS_APPROX(m1/=s1, refM1/=s1);
VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);

VERIFY_IS_APPROX(m1+=m2, refM1+=refM2);
VERIFY_IS_APPROX(m1-=m2, refM1-=refM2);
@ -220,13 +228,22 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
VERIFY_RAISES_ASSERT( m1 -= m1.innerVector(0) );
VERIFY_RAISES_ASSERT( refM1 -= m1.innerVector(0) );
VERIFY_RAISES_ASSERT( refM1 += m1.innerVector(0) );
m1 = m4; refM1 = refM4;
}

// test aliasing
VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1));
VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
m1 = m4; refM1 = refM4;
VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval()));
VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
m1 = m4; refM1 = refM4;
VERIFY_IS_APPROX((m1 = -m1.transpose()), (refM1 = -refM1.transpose().eval()));
VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
m1 = m4; refM1 = refM4;
VERIFY_IS_APPROX((m1 += -m1), (refM1 += -refM1));
VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz);
m1 = m4; refM1 = refM4;

if(m1.isCompressed())
{
@ -231,12 +231,12 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
Matrix<Scalar,MatrixType::RowsAtCompileTime,MatrixType::RowsAtCompileTime> m1m1 = m1 * m1.transpose();
VERIFY_IS_APPROX( (m1 * m1.transpose()).colwise().sum(), m1m1.colwise().sum());
Matrix<Scalar,1,MatrixType::RowsAtCompileTime> tmp(rows);
VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), (MatrixType::RowsAtCompileTime==Dynamic ? 1 : 0));
VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), 1);

m2 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())).eval();
m1 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows()));
VERIFY_IS_APPROX( m1, m2 );
VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime!=1 ? 1 : 0) );
VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) );
}

void test_vectorwiseop()
@ -54,7 +54,7 @@ struct is_input_scalar<Sizes<> > {
static const bool value = true;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::size_t... Indices>
template <typename std::ptrdiff_t... Indices>
struct is_input_scalar<Sizes<Indices...> > {
static const bool value = (Sizes<Indices...>::total_size == 1);
};
@ -150,7 +150,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
};

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
{
EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
@ -206,7 +206,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
@ -218,7 +218,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
return m_impl.template packet<LoadMode>(index + m_inputOffset);
@ -274,17 +274,29 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
}

/// used by sycl
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const {
return m_dim.actualDim();
}

/// used by sycl
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const {
return m_offset;
}
/// required by sycl in order to extract the accessor
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex;
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
inputIndex = index * m_inputStride + m_inputOffset;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
inputIndex = index + m_inputOffset;
@ -304,6 +316,9 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device& m_device;
// required by sycl
const DenseIndex m_offset;

};

@ -344,7 +359,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)

if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(this->m_stride == 1);
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
@ -355,7 +370,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
inputIndex += this->m_inputStride;
}
} else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(this->m_stride > index);
this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
@ -64,9 +64,9 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index
template<typename LhsScalar, typename RhsScalar, typename Scalar>
struct libxsmm_wrapper {
libxsmm_wrapper() {}
libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) {}
void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c) {}
void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c, const LhsScalar* ap, const RhsScalar* bp, const Scalar* cp) {}
libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {}
void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {}
void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {}
};

template<>
@ -220,7 +220,7 @@ struct TensorContractionEvaluatorBase
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device),
m_device(device),
m_result(NULL), m_expr_indices(op.indices()) {
m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@ -682,7 +682,9 @@ protected:
}

m_can_use_xsmm = true;
#endif
#else
EIGEN_UNUSED_VARIABLE(eval_op_indices);
#endif
}

#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
@ -842,9 +844,6 @@ protected:
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
/// required for sycl
const Indices m_expr_indices;

bool m_can_use_xsmm;
};

@ -22,7 +22,7 @@
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
namespace Eigen {

template <typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
@ -146,9 +146,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT

// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
}
// required by sycl to construct the expr on the device. Returns original left_impl
const TensorEvaluator<LeftArgType, Device>& left_impl() const {
@ -158,47 +158,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const TensorEvaluator<RightArgType, Device>& right_impl() const {
return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
}
// required by sycl to construct the expr on the device
const Indices& indices() const {return this->m_expr_indices;}
};

/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster.
template<typename Expr> struct TensorEvaluatorContainer;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluatorContainer<TensorContractionOp<Indices, LeftArgType, RightArgType>>{
typedef Eigen::DefaultDevice Device;
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Eigen::DefaultDevice>::type PacketReturnType;
enum {
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
};

typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;

TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& device)
: m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.lhsExpression(), op.rhsExpression()), device),
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device){}
LeftEvaluator m_leftImpl;
RightEvaluator m_rightImpl;
};

template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN,
int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{

typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;

FunctorExpr functors;
typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN,
typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr;
LHSFunctorExpr lhs_functors;
RHSFunctorExpr rhs_functors;
LhsLocalAcc localLhs;
RhsLocalAcc localRhs;
OutAccessor out_res;
@ -206,119 +177,130 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
LeftNocontractT m_i_strides, m_left_nocontract_strides;
RightNocontractT m_j_strides, m_right_nocontract_strides;
TupleType tuple_of_accessors;
LHSTupleType left_tuple_of_accessors;
RHSTupleType right_tuple_of_accessors;
Device dev;

KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,

KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_)
:functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
:lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
m_right_contracting_strides(m_right_contracting_strides_),
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
tuple_of_accessors(tuple_of_accessors_){}
left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}

void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors);
auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors);
typedef decltype(lhs_dev_expr.expr) LeftArgType;
typedef decltype(rhs_dev_expr.expr) RightArgType;
typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
typename DevEvaluator::LeftEvaluator, LeftNocontractT,
LeftEvaluator, LeftNocontractT,
ContractT, 1,
lhs_inner_dim_contiguous,
false, Unaligned, MakeGlobalPointer> LhsMapper;

typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
typename DevEvaluator::RightEvaluator, RightNocontractT,
RightEvaluator, RightNocontractT,
ContractT, 1,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
// initialize data mappers must happen inside the kernel for device eval
LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
// Matmul Kernel
// Thread identifiers
const int mLocalThreadId = itemID.get_local(0); // Local ID row
const int nLocalThreadId = itemID.get_local(1); // Local ID col
const int mGroupId = itemID.get_group(0); // Work-group ID row
const int nGroupId = itemID.get_group(1); // Work-group ID localCol
const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
const Index mLocalThreadId = itemID.get_local(0); // Local ID row
const Index nLocalThreadId = itemID.get_local(1); // Local ID col
const Index mGroupId = itemID.get_group(0); // Work-group ID row
const Index nGroupId = itemID.get_group(1); // Work-group ID localCol
const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
// Allocate register space
float privateLhs;
float privateRhs[WorkLoadPerThreadN];
float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
// Initialise the privateResumulation registers
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] = 0.0f;
}
}

// Tile Lhs
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
Index localLhsRow = localLhsLinearId% TileSizeDimM;
Index localLhsCol = localLhsLinearId/TileSizeDimM;
// Load the value (wide vector load)
int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
}
// Tile Rhs
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
Index localRhsRow = localRhsLinearId% TileSizeDimN;
Index localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);

}
// Loop over all tiles
const int numTiles = roundUpK/TileSizeDimK;
int firstHalf=0;
const Index numTiles = roundUpK/TileSizeDimK;
Index firstHalf=0;
do {
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// Load the next tile of Lhs and Rhs into local memory
int nextHalf = firstHalf + 1;
Index nextHalf = firstHalf + 1;
if (nextHalf < numTiles) {
// Tile A
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
Index localLhsRow = localLhsLinearId% TileSizeDimM;
Index localLhsCol = localLhsLinearId/TileSizeDimM;
// global K id
int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
// Store the loaded value into local memory
localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
}
// Tile B
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
Index localRhsRow = localRhsLinearId% TileSizeDimN;
Index localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
// Store the loaded vector into local memory
localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
}
}
// Loop over the values of a single tile
for (int k=0; k<TileSizeDimK; k++) {
for (Index k=0; k<TileSizeDimK; k++) {
// Cache the values of localRhs in registers
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
}
// Perform the computation
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
}
}
@ -327,13 +309,12 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
firstHalf++;
} while (firstHalf<numTiles);


// Store the final results in C
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
if (globalRow< M){
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
}
@ -343,56 +324,73 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
}

};
template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {

static const int TileSizeDimM = 32; // Tile size for dimension M
static const int TileSizeDimN = 32; // Tile size for dimension N
static const int TileSizeDimK = 16; // Tile size for dimension K
static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M
static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N
static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
static const int LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
static const Index TileSizeDimM = 32ul; // Tile size for dimension M
static const Index TileSizeDimN = 32ul; // Tile size for dimension N
static const Index TileSizeDimK = 16ul; // Tile size for dimension K
static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M
static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N
static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression

// RoundUp function to make sure that the global threadId is divisable by local threadId
static int RoundUp(int x, int y) {
static Index RoundUp(Index x, Index y) {
return ((((x) + (y) - 1) / (y))*(y));
}
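As a quick sanity check on the tile bookkeeping above, here is a small standalone sketch (not part of the Eigen sources) that reproduces the same arithmetic with the default tile constants; round_up below mirrors what RoundUp computes.

#include <cassert>
#include <cstddef>

// Round x up to the next multiple of y, as RoundUp above does.
static std::ptrdiff_t round_up(std::ptrdiff_t x, std::ptrdiff_t y) {
  return ((x + y - 1) / y) * y;
}

int main() {
  const std::ptrdiff_t TileM = 32, TileN = 32, TileK = 16, WorkM = 4, WorkN = 4;
  // Each work-group computes a 32x32 tile of the result with an 8x8 thread block,
  // every thread accumulating a 4x4 sub-block in registers.
  assert(TileM / WorkM == 8 && TileN / WorkN == 8);
  // Each thread loads 8 Lhs and 8 Rhs elements per K-tile: (16*4*4)/32 == 8.
  assert((TileK * WorkM * WorkN) / TileN == 8);
  // Global sizes are padded so the launch ranges divide evenly,
  // e.g. K == 70 is processed as round_up(70, 16) == 80, M == 70 as 96 rows.
  assert(round_up(70, TileK) == 80);
  assert(round_up(70, TileM) == 96);
  return 0;
}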

template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
// create a tuple of accessors from Evaluator

typedef typename Self::XprType HostExpr;
// typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
// typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
typedef decltype(functors) FunctorExpr;
typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr;
typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr;
typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr;
typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr;
// extract lhs functor list
LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
// extract rhs functor list
RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());

Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);

self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
typedef decltype(tuple_of_accessors) TupleType;
/// work-around for gcc bug
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
/// work-around for gcc bug
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType;
// create lhs tuple of accessors
LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl());
// create rhs tuple of accessors
RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl());

// Local memory for elements of Lhs
typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
// Local memory for elements of Rhs
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);

typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
//OutScalar memory
auto out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
typedef decltype(out_res) OutAccessor;
OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);

// sycl parallel for
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors,
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors));
m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
});
self.device().asynchronousExec();
}
@ -246,6 +246,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>

EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

protected:
template <int LoadMode, bool ActuallyVectorize>
struct PacketConv {
@ -100,7 +100,7 @@ class IndexMapper {
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (i + 1 < offset) {
if (static_cast<size_t>(i + 1) < offset) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
m_cudaOutputStrides[i] =
476
unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
Normal file
@ -0,0 +1,476 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

namespace Eigen {

/** \class TensorConvolution
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor convolution class.
*
*
*/
template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
struct EigenConvolutionKernel1D{
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
Kernel_accessor kernel_filter;
const size_t kernelSize, range_x, range_y;
Buffer_accessor buffer_acc;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

void operator()(cl::sycl::nd_item<2> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());

auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);

const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory
const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input;
const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1));
/// fill the shared memory
for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
const size_t local_index = i + plane_kernel_offset ;
const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start);
if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){
local_acc[local_index] = device_evaluator.coeff(tensor_index);
}
else local_acc[local_index]=0.0f;
}

itemID.barrier(cl::sycl::access::fence_space::local_space);

// calculate the convolution
const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){
CoeffReturnType result = static_cast<CoeffReturnType>(0);
const size_t index = plane_kernel_offset+ itemID.get_local(0);
for (size_t k = 0; k < kernelSize; ++k) {
result += (local_acc[k + index] * kernel_ptr[k]);
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
buffer_ptr[tensor_index] = result;
}
}
};
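To make the indexing above easier to follow, here is a minimal host-side sketch of the valid 1D convolution each output element computes, i.e. out[i] = sum_k in[i + k] * ker[k]; it is plain C++ for illustration only and uses neither SYCL nor the accessor plumbing of the kernel.

#include <cassert>
#include <cstddef>
#include <vector>

// Valid (no padding) 1D convolution: out has size in.size() - ker.size() + 1.
static std::vector<float> conv1d_valid(const std::vector<float>& in,
                                       const std::vector<float>& ker) {
  assert(in.size() >= ker.size());
  std::vector<float> out(in.size() - ker.size() + 1, 0.0f);
  for (std::size_t i = 0; i < out.size(); ++i)
    for (std::size_t k = 0; k < ker.size(); ++k)
      out[i] += in[i + k] * ker[k];   // same accumulation as the kernel's inner loop
  return out;
}

int main() {
  std::vector<float> in  = {1, 2, 3, 4, 5};
  std::vector<float> ker = {1, 1, 1};
  std::vector<float> out = conv1d_valid(in, ker);  // {6, 9, 12}
  assert(out.size() == 3 && out[0] == 6.0f && out[2] == 12.0f);
  return 0;
}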
|
||||
|
||||
|
||||
template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
struct EigenConvolutionKernel2D{
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
Kernel_accessor kernel_filter;
const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
Buffer_accessor buffer_acc;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());

auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); // number of input columns each tile needs in shared memory (tile width plus kernel halo)
const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); // number of input rows each tile needs in shared memory (tile height plus kernel halo)
const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2));
const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input;

/// fill the shared memory
const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
const size_t local_input_offset = num_x_input * (j + plane_kernel_offset);
for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
const size_t local_index = i + local_input_offset;
const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start );
if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){
local_acc[local_index] = device_evaluator.coeff(tensor_index);
}
else local_acc[local_index]=0.0f;
}
}

itemID.barrier(cl::sycl::access::fence_space::local_space);

// calculate the convolution
const size_t first_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
const size_t first_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y
if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
CoeffReturnType result = static_cast<CoeffReturnType>(0);
for (size_t j = 0; j < kernelSize_y; j++) {
size_t kernel_offset =kernelSize_x * j;
const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0);
for (size_t i = 0; i < kernelSize_x; i++) {
result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]);
}
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start);
buffer_ptr[tensor_index] = result;
}
}
};

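// Illustrative sketch (not from the Eigen sources): the local-memory footprint implied by
// the 2D kernel above. Every plane handled by a work-group keeps a
// (tileX + kernelX - 1) x (tileY + kernelY - 1) halo tile resident; the launch code further
// below computes the same quantity as 'shared_mem' and asserts that it fits. The helper
// names and the example sizes are illustrative.
#include <cstddef>
#include <cassert>

inline std::size_t conv2DLocalMemElements(std::size_t tileX, std::size_t tileY, std::size_t tileZ,
                                          std::size_t kernelX, std::size_t kernelY) {
  return (tileX + kernelX - 1) * (tileY + kernelY - 1) * tileZ;
}

inline void conv2DLocalMemExample(std::size_t localMemBytes) {
  // A 16x16 tile with one plane per group and a 3x5 kernel needs 18*20*1 = 360 elements,
  // i.e. 1440 bytes of local memory for float coefficients.
  const std::size_t elems = conv2DLocalMemElements(16, 16, 1, 3, 5);
  assert(elems == 360);
  assert(elems * sizeof(float) <= localMemBytes);
}
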
template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
struct EigenConvolutionKernel3D{
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
Kernel_accessor kernel_filter;
const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
Buffer_accessor buffer_acc;
Local_accessor local_acc;
FunctorExpr functors;
TupleType tuple_of_accessors;
EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());

auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); // number of input elements each tile needs along x in shared memory (tile extent plus kernel halo)
const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); // number of input elements each tile needs along y in shared memory (tile extent plus kernel halo)
const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); // number of input elements each tile needs along z in shared memory (tile extent plus kernel halo)
const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2];
for(size_t p=0; p<numP; p++){
/// fill the shared memory
const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) {
for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
const size_t local_index = i + (num_x_input * (j + (num_y_input * k)));
const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start );
if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){
local_acc[local_index] = device_evaluator.coeff(tensor_index);
}
else local_acc[local_index]=0.0f;
}
}
}
itemID.barrier(cl::sycl::access::fence_space::local_space);

// calculate the convolution
const size_t first_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x
const size_t first_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y
const size_t first_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z

if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
CoeffReturnType result = static_cast<CoeffReturnType>(0);
for (size_t k = 0; k < kernelSize_z; k++) {
for (size_t j = 0; j < kernelSize_y; j++) {
for (size_t i = 0; i < kernelSize_x; i++) {
const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k);
const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2))));
result += (local_acc[local_index] * kernel_ptr[kernel_index]);
}
}
}
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start, itemID.get_local(2) + first_z_output_start );
buffer_ptr[tensor_index] = result;
}

itemID.barrier(cl::sycl::access::fence_space::local_space);
}
}
};

template<typename Indices, typename InputArgType, typename KernelArgType>
|
||||
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
|
||||
{
|
||||
typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
|
||||
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value;
|
||||
static const int NumKernelDims = internal::array_size<Indices>::value;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions;
|
||||
typedef const Eigen::SyclDevice Device;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
|
||||
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
|
||||
const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
|
||||
|
||||
m_dimensions = m_inputImpl.dimensions();
|
||||
for (int i = 0; i < NumKernelDims; ++i) {
|
||||
const Index index = op.indices()[i];
|
||||
const Index input_dim = input_dims[index];
|
||||
const Index kernel_dim = kernel_dims[i];
|
||||
const Index result_dim = input_dim - kernel_dim + 1;
|
||||
m_dimensions[index] = result_dim;
|
||||
}
|
||||
}
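// Illustrative sketch (not from the Eigen sources): the dimension rule applied in the loop
// just above. Each convolved dimension shrinks to input - kernel + 1 (a "valid" convolution
// with no padding); dimensions that are not convolved keep their size. Names are illustrative.
#include <cstddef>
#include <cassert>

inline std::size_t validConvDim(std::size_t inputDim, std::size_t kernelDim) {
  assert(kernelDim >= 1 && kernelDim <= inputDim);
  return inputDim - kernelDim + 1;
}

inline void validConvDimExample() {
  // Convolving a 128x64 input with a 5x3 kernel over both dimensions yields 124x62.
  assert(validConvDim(128, 5) == 124);
  assert(validConvDim(64, 3) == 62);
}
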
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType;
|
||||
typedef typename InputArgType::Scalar Scalar;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
|
||||
preloadKernel();
|
||||
m_inputImpl.evalSubExprsIfNeeded(NULL);
|
||||
if (data) {
|
||||
executeEval(data);
|
||||
return false;
|
||||
} else {
|
||||
m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
|
||||
executeEval(m_buf);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_inputImpl.cleanup();
|
||||
if (m_buf) {
|
||||
m_device.deallocate(m_buf);
|
||||
m_buf = NULL;
|
||||
}
|
||||
if (m_local_kernel) {
|
||||
m_device.deallocate((void*)m_kernel);
|
||||
m_local_kernel = false;
|
||||
}
|
||||
m_kernel = NULL;
|
||||
}
|
||||
/// used by sycl in order to build the sycl buffer
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
|
||||
/// used by sycl in order to build the sycl buffer
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
|
||||
// Don't make a local copy of the kernel unless we have to (i.e. it's an
|
||||
// expression that needs to be evaluated)
|
||||
const Scalar* in_place = m_kernelImpl.data();
|
||||
if (in_place) {
|
||||
m_kernel = in_place;
|
||||
m_local_kernel = false;
|
||||
} else {
|
||||
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
|
||||
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
|
||||
typedef TensorEvalToOp<const KernelArgType> EvalTo;
|
||||
EvalTo evalToTmp(local, m_kernelArg);
|
||||
const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value;
|
||||
internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
|
||||
m_kernel = local;
|
||||
m_local_kernel = true;
|
||||
}
|
||||
}
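// Illustrative sketch (not from the Eigen sources): the "borrow if addressable, otherwise
// evaluate into an owned buffer" policy that preloadKernel() above implements, reduced to
// plain C++. 'directData' stands in for m_kernelImpl.data() and 'evaluate' for running the
// TensorExecutor over a TensorEvalToOp; all names are illustrative.
#include <cstddef>
#include <vector>
#include <functional>

struct KernelBuffer {
  const float* data = nullptr;   // what the convolution reads from
  std::vector<float> owned;      // only filled when a local copy was required
  bool local = false;            // mirrors m_local_kernel: true means cleanup must release it
};

inline KernelBuffer preloadKernelSketch(const float* directData, std::size_t size,
                                        const std::function<void(float*)>& evaluate) {
  KernelBuffer buf;
  if (directData) {
    buf.data = directData;       // borrow: the kernel expression already lives in memory
  } else {
    buf.owned.resize(size);
    evaluate(buf.owned.data());  // force-evaluate the expression into a contiguous buffer
    buf.data = buf.owned.data();
    buf.local = true;
  }
  return buf;
}
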
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const {
|
||||
typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator;
|
||||
typedef typename InputEvaluator::Dimensions InputDims;
|
||||
|
||||
typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
|
||||
// extract input functor list
|
||||
InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
|
||||
|
||||
|
||||
m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
|
||||
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc;
|
||||
/// work-around for gcc 4.8 auto bug
|
||||
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType;
|
||||
// create input tuple of accessors
|
||||
InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
|
||||
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
|
||||
OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
|
||||
KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
|
||||
|
||||
switch (NumKernelDims) {
|
||||
case 1: {
|
||||
const size_t numX = dimensions()[m_indices[0]];
|
||||
const size_t numP = dimensions().TotalSize() / numX;
|
||||
const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
|
||||
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
|
||||
m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
|
||||
const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
|
||||
assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
|
||||
auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
|
||||
auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
|
||||
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
|
||||
const array<Index, 1> indices{{m_indices[0]}};
|
||||
const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
|
||||
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
|
||||
cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
|
||||
EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
|
||||
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
|
||||
indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
|
||||
break;
|
||||
}
|
||||
|
||||
case 2: {
|
||||
const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
|
||||
const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
|
||||
const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
|
||||
const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
|
||||
const size_t numX = dimensions()[m_indices[idxX]];
|
||||
const size_t numY = dimensions()[m_indices[idxY]];
|
||||
const size_t numP = dimensions().TotalSize() / (numX*numY);
|
||||
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
|
||||
m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
|
||||
const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
|
||||
assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
|
||||
auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
|
||||
auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
|
||||
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
|
||||
const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}};
|
||||
const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}};
|
||||
internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
|
||||
cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
|
||||
EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
|
||||
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
|
||||
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
|
||||
break;
|
||||
}
|
||||
|
||||
case 3: {
|
||||
const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
|
||||
const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
|
||||
const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
|
||||
const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
|
||||
const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
|
||||
const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ];
|
||||
const size_t numX = dimensions()[m_indices[idxX]];
|
||||
const size_t numY = dimensions()[m_indices[idxY]];
|
||||
const size_t numZ = dimensions()[m_indices[idxZ]];
|
||||
const size_t numP = dimensions().TotalSize() / (numX*numY*numZ);
|
||||
const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}};
|
||||
const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}};
|
||||
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
|
||||
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
|
||||
m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
|
||||
const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_z -1);
|
||||
assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
|
||||
auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
|
||||
auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
|
||||
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
|
||||
cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
|
||||
EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
|
||||
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
|
||||
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
|
||||
numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
|
||||
break;
|
||||
}
|
||||
|
||||
default: {
|
||||
EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
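// Illustrative usage sketch (not from the Eigen sources): how the evaluator above is
// typically driven, in the spirit of Eigen's SYCL tensor tests. EIGEN_USE_SYCL must be
// defined before the Tensor header is included. How the Eigen::SyclDevice itself is
// constructed is deliberately left to the caller, since that is an assumption about the
// surrounding test harness; the device memory calls used below appear in this commit, and
// convolve() is the expression this evaluator implements.
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

inline void convolveOnSyclSketch(const Eigen::SyclDevice& sycl_device) {
  const std::ptrdiff_t rows = 128, cols = 64, ksize = 5;
  Eigen::Tensor<float, 2> host_in(rows, cols);
  Eigen::Tensor<float, 1> host_ker(ksize);
  Eigen::Tensor<float, 2> host_out(rows - ksize + 1, cols);   // "valid" convolution along dim 0
  host_in.setRandom();
  host_ker.setRandom();

  // Device buffers are raw allocations; TensorMap gives them tensor shape.
  float* d_in  = static_cast<float*>(sycl_device.allocate(host_in.size()  * sizeof(float)));
  float* d_ker = static_cast<float*>(sycl_device.allocate(host_ker.size() * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(host_out.size() * sizeof(float)));
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(d_in, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_ker(d_ker, ksize);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, rows - ksize + 1, cols);

  sycl_device.memcpyHostToDevice(d_in,  host_in.data(),  host_in.size()  * sizeof(float));
  sycl_device.memcpyHostToDevice(d_ker, host_ker.data(), host_ker.size() * sizeof(float));

  const Eigen::array<std::ptrdiff_t, 1> dims{{0}};            // convolve along dimension 0
  gpu_out.device(sycl_device) = gpu_in.convolve(gpu_ker, dims);

  sycl_device.memcpyDeviceToHost(host_out.data(), d_out, host_out.size() * sizeof(float));
  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_ker);
  sycl_device.deallocate(d_out);
}
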
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
eigen_assert(m_buf);
|
||||
eigen_assert(index < m_dimensions.TotalSize());
|
||||
return m_buf[index];
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
|
||||
{
|
||||
eigen_assert(m_buf);
|
||||
eigen_assert(index < m_dimensions.TotalSize());
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
// TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
|
||||
// model.
|
||||
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
|
||||
// We ignore the use of fused multiply-add.
|
||||
const double convolve_compute_cost =
|
||||
TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
|
||||
const double firstIndex_compute_cost =
|
||||
NumDims *
|
||||
(2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>());
|
||||
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
|
||||
kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
|
||||
m_kernelImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, convolve_compute_cost, vectorized,
|
||||
PacketSize));
|
||||
}
|
||||
|
||||
private:
|
||||
// No assignment (copies are needed by the kernels)
|
||||
TensorEvaluator& operator = (const TensorEvaluator&);
|
||||
TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl;
|
||||
KernelArgType m_kernelArg;
|
||||
TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl;
|
||||
Indices m_indices;
|
||||
Dimensions m_dimensions;
|
||||
Scalar* m_buf;
|
||||
const Scalar* m_kernel;
|
||||
bool m_local_kernel;
|
||||
const Eigen::SyclDevice& m_device;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
|
@ -19,12 +19,9 @@ namespace Eigen {
|
||||
|
||||
#define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
|
||||
|
||||
template <typename Scalar> class MemCopyFunctor {
|
||||
template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
|
||||
public:
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
|
||||
|
||||
MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
|
||||
MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
|
||||
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc);
|
||||
@ -62,7 +59,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device
|
||||
/// get_devices returns all the available OpenCL devices. Either use a device_selector or exclude devices that ComputeCpp does not support (AMD OpenCL for CPU).
|
||||
auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
|
||||
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
|
||||
if((*it).is_cpu() && s.find("amd")!=std::string::npos){ // remove amd cpu as it is not supported by computecpp
|
||||
if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove AMD CPU devices, which ComputeCpp does not support, but keep APUs
|
||||
it=devices.erase(it);
|
||||
}
|
||||
else{
|
||||
@ -133,11 +130,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
auto it = buffer_map.find(static_cast<const uint8_t*>(p));
|
||||
if (it != buffer_map.end()) {
|
||||
auto num_bytes =it->second.get_size();
|
||||
buffer_map.erase(it);
|
||||
// Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version
|
||||
std::allocator<uint8_t> a1; // Default allocator for buffer<uint8_t,1>
|
||||
a1.deallocate(static_cast<uint8_t*>(p), num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@ -158,7 +151,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
|
||||
if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
|
||||
}
|
||||
}
|
||||
std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling allocate function in SyclDevice"<< std::endl;
|
||||
std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
@ -197,7 +190,12 @@ struct SyclDevice {
|
||||
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
|
||||
template<typename Index>
|
||||
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
|
||||
tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2);
|
||||
tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
|
||||
auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
|
||||
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
|
||||
if(sycl_queue().get_device().is_cpu()){ // the Intel OpenCL CPU runtime does not allow using the maximum work-group size
|
||||
tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
|
||||
}
|
||||
rng = n;
|
||||
if (rng==0) rng=static_cast<Index>(1);
|
||||
GRange=rng;
|
||||
@ -207,6 +205,74 @@ struct SyclDevice {
|
||||
if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
|
||||
}
|
||||
}
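// Illustrative sketch (not from the Eigen sources): the rounding rule used above. The global
// range must be a whole number of tiles (work-groups), so it is rounded up to a multiple of
// tileSize; the extra work-items are masked off inside the kernels with "global id < range"
// checks. Names are illustrative.
#include <cstddef>
#include <cassert>

inline std::size_t roundUpToTile(std::size_t n, std::size_t tileSize) {
  std::size_t rng = (n == 0) ? 1 : n;      // never launch an empty range
  if (tileSize > rng) return rng;          // the setup above shrinks the tile to rng instead
  const std::size_t rem = rng % tileSize;
  return (rem == 0) ? rng : rng + (tileSize - rem);
}

inline void roundUpToTileExample() {
  assert(roundUpToTile(1000, 256) == 1024);  // 24 padding work-items are masked off
  assert(roundUpToTile(1024, 256) == 1024);  // already a multiple of the tile size
  assert(roundUpToTile(0, 256) == 1);        // degenerate range still launches one item
}
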
|
||||
|
||||
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
|
||||
template<typename Index>
|
||||
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
|
||||
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
|
||||
if(sycl_queue().get_device().is_cpu()){ // the Intel OpenCL CPU runtime does not allow using the maximum work-group size
|
||||
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
|
||||
}
|
||||
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
|
||||
tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
|
||||
rng1=dim1;
|
||||
if (rng1==0 ) rng1=static_cast<Index>(1);
|
||||
GRange1=rng1;
|
||||
if (tileSize1>GRange1) tileSize1=GRange1;
|
||||
else if(GRange1>tileSize1){
|
||||
Index xMode = static_cast<Index>(GRange1 % tileSize1);
|
||||
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
|
||||
}
|
||||
tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
|
||||
rng0 = dim0;
|
||||
if (rng0==0 ) rng0=static_cast<Index>(1);
|
||||
GRange0=rng0;
|
||||
if (tileSize0>GRange0) tileSize0=GRange0;
|
||||
else if(GRange0>tileSize0){
|
||||
Index xMode = static_cast<Index>(GRange0 % tileSize0);
|
||||
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
|
||||
template<typename Index>
|
||||
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
|
||||
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
|
||||
if(sycl_queue().get_device().is_cpu()){ // the Intel OpenCL CPU runtime does not allow using the maximum work-group size
|
||||
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
|
||||
}
|
||||
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
|
||||
tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
|
||||
rng2=dim2;
|
||||
if (rng2==0 ) rng2=static_cast<Index>(1);
|
||||
GRange2=rng2;
|
||||
if (tileSize2>GRange2) tileSize2=GRange2;
|
||||
else if(GRange2>tileSize2){
|
||||
Index xMode = static_cast<Index>(GRange2 % tileSize2);
|
||||
if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
|
||||
}
|
||||
pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
|
||||
tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
|
||||
rng1=dim1;
|
||||
if (rng1==0 ) rng1=static_cast<Index>(1);
|
||||
GRange1=rng1;
|
||||
if (tileSize1>GRange1) tileSize1=GRange1;
|
||||
else if(GRange1>tileSize1){
|
||||
Index xMode = static_cast<Index>(GRange1 % tileSize1);
|
||||
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
|
||||
}
|
||||
tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
|
||||
rng0 = dim0;
|
||||
if (rng0==0 ) rng0=static_cast<Index>(1);
|
||||
GRange0=rng0;
|
||||
if (tileSize0>GRange0) tileSize0=GRange0;
|
||||
else if(GRange0>tileSize0){
|
||||
Index xMode = static_cast<Index>(GRange0 % tileSize0);
|
||||
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
|
||||
}
|
||||
}
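// Illustrative sketch (not from the Eigen sources): how the 3D setup above factors the
// maximum work-group size into three tile sizes using powers of two, before each global
// range is rounded up as in the 1D case. For a maximum of 256 work-items this yields tiles
// of 4 (dim2), 8 (dim1) and 8 (dim0). Names are illustrative.
#include <cstddef>
#include <cmath>
#include <cassert>

inline void splitWorkGroup3D(std::size_t maxWorkGroupSize,
                             std::size_t& tile0, std::size_t& tile1, std::size_t& tile2) {
  std::size_t pow_of_2 = static_cast<std::size_t>(std::log2(maxWorkGroupSize));
  tile2 = static_cast<std::size_t>(std::pow(2, pow_of_2 / 3));           // smallest share for dim2
  pow_of_2 = static_cast<std::size_t>(std::log2(maxWorkGroupSize / tile2));
  tile1 = static_cast<std::size_t>(std::pow(2, pow_of_2 / 2));           // half of what remains for dim1
  tile0 = maxWorkGroupSize / (tile1 * tile2);                            // dim0 takes the rest
}

inline void splitWorkGroup3DExample() {
  std::size_t t0 = 0, t1 = 0, t2 = 0;
  splitWorkGroup3D(256, t0, t1, t2);
  assert(t0 == 8 && t1 == 8 && t2 == 4);   // 8 * 8 * 4 == 256 work-items per group
}
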
|
||||
/// allocate device memory
|
||||
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
|
||||
return m_queue_stream->allocate(num_bytes);
|
||||
@ -220,21 +286,23 @@ struct SyclDevice {
|
||||
EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
|
||||
|
||||
/// the memcpy function
|
||||
template<typename T> EIGEN_STRONG_INLINE void memcpy(void *dst, const T *src, size_t n) const {
|
||||
auto it1 = m_queue_stream->find_buffer((void*)src);
|
||||
template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
|
||||
auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src));
|
||||
auto it2 = m_queue_stream->find_buffer(dst);
|
||||
auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
|
||||
auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
|
||||
offset/=sizeof(T);
|
||||
i/=sizeof(T);
|
||||
offset/=sizeof(Index);
|
||||
i/=sizeof(Index);
|
||||
size_t rng, GRange, tileSize;
|
||||
parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
|
||||
parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
|
||||
sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
|
||||
auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, i, offset));
|
||||
auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
typedef decltype(src_acc) read_accessor;
|
||||
typedef decltype(dst_acc) write_accessor;
|
||||
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
|
||||
});
|
||||
asynchronousExec();
|
||||
synchronize();
|
||||
}
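// Illustrative sketch (not from the Eigen sources): the offset arithmetic used by the
// device-side memcpy above. A raw pointer handed to memcpy may point into the middle of a
// registered SYCL buffer, so the copy works with the element offset from the start of that
// buffer (the 'offset' and 'i' values passed to MemCopyFunctor). Names are illustrative.
#include <cstddef>
#include <cstdint>
#include <cassert>

template <typename T>
std::size_t elementOffsetInBuffer(const std::uint8_t* bufferStart, const T* ptr) {
  const std::size_t byteOffset =
      static_cast<std::size_t>(reinterpret_cast<const std::uint8_t*>(ptr) - bufferStart);
  assert(byteOffset % sizeof(T) == 0);     // the pointer must be element-aligned in the buffer
  return byteOffset / sizeof(T);
}

inline void elementOffsetExample() {
  float storage[100] = {};
  const std::uint8_t* start = reinterpret_cast<const std::uint8_t*>(storage);
  assert(elementOffsetInBuffer(start, &storage[7]) == 7);   // 28 bytes into the buffer
}
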
|
||||
|
||||
/// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
|
||||
@ -242,7 +310,7 @@ struct SyclDevice {
|
||||
/// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
|
||||
/// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
|
||||
/// this buffer is accessed, the data will be copied to the device.
|
||||
template<typename T> EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
|
||||
template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
|
||||
auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
|
||||
::memcpy(host_acc.get_pointer(), src, n);
|
||||
}
|
||||
@ -252,20 +320,22 @@ struct SyclDevice {
|
||||
/// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
|
||||
/// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
|
||||
/// to the cpu only once per function call.
|
||||
template<typename T> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const T *src, size_t n) const {
|
||||
template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
|
||||
auto it = m_queue_stream->find_buffer(src);
|
||||
auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
|
||||
offset/=sizeof(T);
|
||||
offset/=sizeof(Index);
|
||||
size_t rng, GRange, tileSize;
|
||||
parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
|
||||
parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
|
||||
// Assuming that the dst is the start of the destination pointer
|
||||
auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
|
||||
sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
|
||||
auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
|
||||
typedef decltype(src_acc) read_accessor;
|
||||
typedef decltype(dst_acc) write_accessor;
|
||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
|
||||
});
|
||||
asynchronousExec();
|
||||
synchronize();
|
||||
}
|
||||
/// returning the sycl queue
|
||||
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
|
||||
@ -274,7 +344,7 @@ struct SyclDevice {
|
||||
size_t rng, GRange, tileSize;
|
||||
parallel_for_setup(n, tileSize, rng, GRange);
|
||||
sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
|
||||
asynchronousExec();
|
||||
synchronize();
|
||||
}
|
||||
|
||||
struct memsetCghFunctor{
|
||||
@ -300,6 +370,24 @@ struct SyclDevice {
|
||||
// there is no l3 cache on cuda devices.
|
||||
return firstLevelCacheSize();
|
||||
}
|
||||
EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
|
||||
return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
|
||||
// return stream_->deviceProperties().multiProcessorCount;
|
||||
}
|
||||
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
|
||||
return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
|
||||
|
||||
// return stream_->deviceProperties().maxThreadsPerBlock;
|
||||
}
|
||||
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
|
||||
// OpenCL does not have an equivalent concept
|
||||
return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
|
||||
// return stream_->deviceProperties().maxThreadsPerMultiProcessor;
|
||||
}
|
||||
EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
|
||||
return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
|
||||
// return stream_->deviceProperties().sharedMemPerBlock;
|
||||
}
|
||||
/// No need for sycl it should act the same as CPU version
|
||||
EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
|
||||
|
||||
@ -307,9 +395,12 @@ struct SyclDevice {
|
||||
sycl_queue().wait_and_throw(); //pass
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void asynchronousExec() const {
|
||||
sycl_queue().throw_asynchronous();//pass
|
||||
}
|
||||
EIGEN_STRONG_INLINE void asynchronousExec() const {
|
||||
/// FIXME: there is currently a race condition in the asynchronous scheduler.
|
||||
//sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
|
||||
sycl_queue().wait_and_throw(); //pass
|
||||
|
||||
}
|
||||
// This function checks if the runtime recorded an error for the
|
||||
// underlying stream device.
|
||||
EIGEN_STRONG_INLINE bool ok() const {
|
||||
@ -318,6 +409,7 @@ struct SyclDevice {
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
|
||||
|
@ -26,8 +26,8 @@ namespace Eigen {
|
||||
/// Therefore, by adding the default value, we managed to convert the type and it does not break any
|
||||
/// existing code as its default value is T*.
|
||||
namespace internal {
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
|
||||
template<typename XprType>
|
||||
struct traits<TensorForcedEvalOp<XprType> >
|
||||
{
|
||||
// Type promotion to handle the case where the types of the lhs and the rhs are different.
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
@ -42,33 +42,26 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
|
||||
enum {
|
||||
Flags = 0
|
||||
};
|
||||
template <class T> struct MakePointer {
|
||||
// Intermediate typedef to workaround MSVC issue.
|
||||
typedef MakePointer_<T> MakePointerT;
|
||||
typedef typename MakePointerT::Type Type;
|
||||
typedef typename MakePointerT::RefType RefType;
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
|
||||
template<typename XprType>
|
||||
struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
|
||||
{
|
||||
typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
|
||||
typedef const TensorForcedEvalOp<XprType>& type;
|
||||
};
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
|
||||
template<typename XprType>
|
||||
struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
|
||||
{
|
||||
typedef TensorForcedEvalOp<XprType, MakePointer_> type;
|
||||
typedef TensorForcedEvalOp<XprType> type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_>
|
||||
class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
|
||||
template<typename XprType>
|
||||
class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
|
||||
@ -90,10 +83,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePoi
|
||||
};
|
||||
|
||||
|
||||
template<typename ArgType, typename Device, template <class> class MakePointer_>
|
||||
struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
|
||||
template<typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
|
||||
{
|
||||
typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
|
||||
typedef TensorForcedEvalOp<ArgType> XprType;
|
||||
typedef typename ArgType::Scalar Scalar;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||
typedef typename XprType::Index Index;
|
||||
@ -150,17 +143,17 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; }
|
||||
|
||||
/// required by sycl in order to extract the sycl accessor
|
||||
const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
|
||||
/// used by sycl in order to build the sycl buffer
|
||||
const Device& device() const{return m_device;}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
|
||||
private:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const ArgType m_op;
|
||||
const Device& m_device;
|
||||
typename MakePointer<CoeffReturnType>::Type m_buffer;
|
||||
CoeffReturnType* m_buffer;
|
||||
};
|
||||
|
||||
|
||||
|
@ -75,7 +75,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
|
||||
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
|
||||
|
||||
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
|
||||
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
|
||||
template<typename XprType> class TensorForcedEvalOp;
|
||||
|
||||
template<typename ExpressionType, typename DeviceType> class TensorDevice;
|
||||
template<typename Derived, typename Device> struct TensorEvaluator;
|
||||
|
@ -205,6 +205,8 @@ class TensorIntDivisor<int32_t, true> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return (__umulhi(magic, n) >> shift);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
|
||||
#else
|
||||
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
|
||||
return (static_cast<uint32_t>(v >> 32) >> shift);
|
||||
|
@ -711,6 +711,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
{
|
||||
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
|
||||
static const int NumDims = internal::array_size<Strides>::value;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef Strides Dimensions;
|
||||
|
||||
enum {
|
||||
// Alignment can't be guaranteed at compile time since it depends on the
|
||||
@ -733,7 +739,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
|
||||
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
|
||||
}else{
|
||||
/* implies m_strides[i]<0 by assert */
|
||||
/* implies m_strides[i]<0 by assert */
|
||||
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
|
||||
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
|
||||
}
|
||||
@ -796,13 +802,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
sizeof(Scalar));
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef Strides Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
|
||||
@ -858,7 +857,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
|
||||
#ifndef __SYCL_DEVICE_ONLY__
|
||||
return numext::maxi(min, numext::mini(max,value));
|
||||
#else
|
||||
return cl::sycl::clamp(value, min, max);
|
||||
#endif
|
||||
}
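// Illustrative sketch (not from the Eigen sources): how the clamp above is used by the
// constructor code earlier in this hunk. With a positive stride the start/stop indices are
// clamped to [0, dim]; with a negative stride the iteration may legitimately stop at -1, so
// the window becomes [-1, dim - 1]. Values are illustrative.
#include <algorithm>
#include <cassert>

inline long clampIndex(long value, long lo, long hi) {
  return std::max(lo, std::min(hi, value));
}

inline void clampIndexExample() {
  const long dim = 10;
  assert(clampIndex(-3, 0, dim) == 0);            // positive stride: negative start snaps to 0
  assert(clampIndex(42, 0, dim) == dim);          // positive stride: stop saturates at dim
  assert(clampIndex(-5, -1, dim - 1) == -1);      // negative stride: -1 marks "one before begin"
  assert(clampIndex(12, -1, dim - 1) == dim - 1); // negative stride: start saturates at dim - 1
}
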
|
||||
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
|
@ -25,11 +25,11 @@
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
template<typename CoeffReturnType> struct syclGenericBufferReducer{
|
||||
template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
|
||||
template<typename BufferTOut, typename BufferTIn>
|
||||
static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
|
||||
static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
|
||||
do {
|
||||
auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
|
||||
auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
|
||||
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
|
||||
cl::sycl::range<1>{std::min(length, local)}};
|
||||
/* Two accessors are used: one to the buffer that is being reduced,
|
||||
@ -43,7 +43,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
|
||||
|
||||
/* The parallel_for invocation chosen is the variant with an nd_item
|
||||
* parameter, since the code requires barriers for correctness. */
|
||||
h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local));
|
||||
h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
|
||||
};
|
||||
dev.sycl_queue().submit(f);
|
||||
dev.asynchronousExec();
|
||||
@ -54,13 +54,18 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
|
||||
length = length / local;
|
||||
|
||||
} while (length > 1);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
|
||||
template<typename BufferTOut, typename BufferTIn>
|
||||
static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
|
||||
syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
|
||||
bufOut, bufI, dev, length, local);
|
||||
}
|
||||
};
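// Illustrative sketch (not from the Eigen sources): the host-side driver loop of
// syclGenericBufferReducer::run above, with the device kernel replaced by a serial pass.
// Each pass reduces chunks of 'local' values into one partial result, and the loop repeats
// on the partials until a single value remains. Names are illustrative.
#include <cstddef>
#include <vector>
#include <algorithm>

template <typename Op>
float reduceInPasses(std::vector<float> data, std::size_t local, Op op, float init) {
  if (data.empty()) return init;
  std::size_t length = data.size();
  do {
    const std::size_t groups = (length + local - 1) / local;   // one "work-group" per chunk
    for (std::size_t g = 0; g < groups; ++g) {
      float acc = init;
      const std::size_t end = std::min(length, (g + 1) * local);
      for (std::size_t i = g * local; i < end; ++i) acc = op(acc, data[i]);
      data[g] = acc;                                            // partial result written in place
    }
    length = groups;                                            // next pass reduces the partials
  } while (length > 1);
  return data[0];
}

// e.g. reduceInPasses(values, 256, [](float a, float b) { return a + b; }, 0.0f) for a sum.
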
|
||||
|
||||
/// Self is useless here because in expression construction we are going to treat reduction as a leaf node.
/// We take the child of the reduction, build a construction from it, and apply the full reducer function to it. FullReducer applies the
/// reduction operation to the child of the reduction. Once that is done the reduction node is an empty shell that can be thrown away and treated as
|
||||
@ -74,8 +79,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
||||
|
||||
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
|
||||
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
|
||||
auto functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||
typedef decltype(functors) FunctorExpr;
|
||||
typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
|
||||
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||
int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
|
||||
size_t inputSize =self.impl().dimensions().TotalSize();
|
||||
size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
|
||||
@ -108,9 +113,10 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
||||
// Dims dims= self.xprDims();
|
||||
//Op functor = reducer;
|
||||
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
// this is a workaround for gcc 4.8 bug
|
||||
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
|
||||
// create a tuple of accessors from Evaluator
|
||||
auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
typedef decltype(tuple_of_accessors) TupleType;
|
||||
TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
typedef decltype(tmp_global_accessor) OutAccessor;
|
||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
|
||||
@ -122,7 +128,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
||||
// getting final out buffer at the moment the created buffer is true because there is no need for assign
|
||||
auto out_buffer =dev.get_sycl_buffer(output);
|
||||
/// This is used to recursively reduce the tmp value to an element of 1;
|
||||
syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
||||
syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
||||
}
|
||||
|
||||
};
|
||||
@ -134,10 +140,10 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
||||
typedef typename Self::CoeffReturnType CoeffReturnType;
|
||||
static const bool HasOptimizedImplementation = false;
|
||||
|
||||
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
|
||||
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
|
||||
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
|
||||
auto functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||
typedef decltype(functors) FunctorExpr;
|
||||
typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
|
||||
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||
typename Self::Index range, GRange, tileSize;
|
||||
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
|
||||
|
||||
@ -147,14 +153,15 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
||||
/// recursively apply reduction on it in order to reduce the whole.
|
||||
dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
|
||||
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
// this is workaround for gcc 4.8 bug.
|
||||
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
|
||||
// create a tuple of accessors from Evaluator
|
||||
auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
typedef typename Eigen::internal::remove_all<decltype(tuple_of_accessors)>::type Tuple_of_Acc;
|
||||
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
|
||||
|
||||
Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
|
||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
|
||||
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
|
||||
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
|
||||
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
|
||||
|
||||
});
|
||||
dev.asynchronousExec();
|
||||
|
@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
|
||||
/// added for sycl in order to construct the buffer from sycl device
|
||||
ReverseDimensions functor() const { return m_reverse; }
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_strides;
|
||||
|
@ -126,7 +126,7 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
|
||||
}
|
||||
else
|
||||
m_data = 0;
|
||||
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
|
||||
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
|
||||
}
|
||||
m_dimensions = nbDimensions;
|
||||
}
|
||||
|
@ -117,11 +117,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device)
|
||||
: m_impl(op.expression(), device), m_strides(op.strides())
|
||||
{
|
||||
m_dimensions = m_impl.dimensions();
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
|
||||
m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
|
||||
}
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
Strides functor() const { return m_strides; }
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
{
|
||||
@ -250,9 +255,9 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const Strides m_strides;
|
||||
};
|
||||
|
||||
|
||||
// Eval as lvalue
|
||||
template<typename Strides, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
||||
@ -286,6 +291,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||
}
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
|
@@ -35,7 +35,7 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {

template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;

/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
@@ -80,6 +80,9 @@ template<typename T> struct GetType<false, T>{
/// this is used for extracting tensor reduction
#include "TensorReductionSycl.h"

/// this is used for extracting tensor convolution
#include "TensorConvolutionSycl.h"

// kernel execution using fusion
#include "TensorSyclRun.h"
//sycl functors

@ -97,8 +97,18 @@ template <typename Expr>\
|
||||
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
|
||||
: DeviceConvertor<ExprNode, Res, Expr>{};
|
||||
|
||||
KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
|
||||
KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp
|
||||
#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
|
||||
template <typename Expr>\
|
||||
struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
|
||||
typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\
|
||||
};
|
||||
KERNELBROKERCONVERTFORCEDEVAL(const)
|
||||
KERNELBROKERCONVERTFORCEDEVAL()
|
||||
#undef KERNELBROKERCONVERTFORCEDEVAL
|
||||
|
||||
|
||||
|
||||
KERNELBROKERCONVERT(const, true, TensorEvalToOp)
|
||||
KERNELBROKERCONVERT(, false, TensorEvalToOp)
|
||||
#undef KERNELBROKERCONVERT
|
||||
@ -136,6 +146,18 @@ KERNELBROKERCONVERTERSLICESTRIDEOP()
|
||||
#undef KERNELBROKERCONVERTERSLICESTRIDEOP
|
||||
|
||||
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
|
||||
#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
|
||||
template <DenseIndex DimId, typename Expr>\
|
||||
struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\
|
||||
typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\
|
||||
};
|
||||
KERNELBROKERCONVERTCHIPPINGOP(const)
|
||||
KERNELBROKERCONVERTCHIPPINGOP()
|
||||
#undef KERNELBROKERCONVERTCHIPPINGOP
|
||||
|
||||
|
||||
|
||||
} // namespace internal
|
||||
} // namespace TensorSycl
|
||||
} // namespace Eigen
|
||||
|
@ -188,6 +188,28 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
|
||||
ASSIGN(const)
|
||||
ASSIGN()
|
||||
#undef ASSIGN
|
||||
|
||||
|
||||
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// const TensorAssignOp
|
||||
#define CONVERSIONEXPRCONST(CVQual)\
|
||||
template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
|
||||
struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
|
||||
typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
|
||||
typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
|
||||
my_nested_type nestedExpr;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
|
||||
: nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
|
||||
};
|
||||
|
||||
CONVERSIONEXPRCONST(const)
|
||||
CONVERSIONEXPRCONST()
|
||||
#undef CONVERSIONEXPRCONST
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// TensorEvalToOp /// 0 here is the output number in the buffer
|
||||
#define EVALTO(CVQual)\
|
||||
@ -212,10 +234,10 @@ EVALTO()
|
||||
/// TensorForcedEvalOp
|
||||
#define FORCEDEVAL(CVQual)\
|
||||
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
|
||||
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
|
||||
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
|
||||
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
|
||||
typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
|
||||
TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
|
||||
typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
|
||||
TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
|
||||
@ -252,6 +274,30 @@ SYCLREDUCTIONEXPR()
|
||||
#undef SYCLREDUCTIONEXPR
|
||||
|
||||
|
||||
/// specialisation of the \ref ExprConstructor struct when the node type is
|
||||
/// TensorContractionOp
|
||||
#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
|
||||
template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
|
||||
struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
|
||||
CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
|
||||
static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
|
||||
typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
|
||||
NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
|
||||
typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
|
||||
Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
|
||||
:expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
|
||||
};
|
||||
|
||||
SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
|
||||
SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
|
||||
SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
|
||||
SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
|
||||
#undef SYCLCONTRACTIONCONVOLUTION
|
||||
|
||||
|
||||
|
||||
#define SYCLSLICEOPEXPR(CVQual)\
|
||||
template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
|
||||
@ -322,6 +368,23 @@ SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
|
||||
#undef SYCLPADDINGOPEXPRCONST
|
||||
|
||||
|
||||
// TensorChippingOp
|
||||
#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
|
||||
template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
|
||||
struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... >{\
|
||||
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
|
||||
typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\
|
||||
my_xpr_type xprExpr;\
|
||||
Type expr;\
|
||||
template <typename FuncDetector>\
|
||||
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
|
||||
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\
|
||||
};
|
||||
|
||||
SYCLTENSORCHIPPINGOPEXPR(const)
|
||||
SYCLTENSORCHIPPINGOPEXPR()
|
||||
#undef SYCLTENSORCHIPPINGOPEXPR
|
||||
|
||||
|
||||
/// template deduction for \ref ExprConstructor struct
|
||||
template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
|
||||
|
@ -35,6 +35,8 @@
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}
|
||||
|
||||
/// \struct ExtractAccessor: Extract Accessor Class is used to extract the
|
||||
/// accessor from a buffer.
|
||||
/// Depending on the type of the leaf node we can get a read accessor or a
|
||||
@ -44,22 +46,16 @@ struct ExtractAccessor;
|
||||
|
||||
struct AccessorConstructor{
|
||||
template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
|
||||
-> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
|
||||
return ExtractAccessor<Arg>::getTuple(cgh, eval);
|
||||
}
|
||||
RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))
|
||||
|
||||
template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2)
|
||||
-> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
|
||||
return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
|
||||
}
|
||||
RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)))
|
||||
|
||||
template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3)
|
||||
-> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
|
||||
return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
|
||||
}
|
||||
RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))))
|
||||
|
||||
template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval)
|
||||
-> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))){
|
||||
return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()));
|
||||
}
|
||||
RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())))
|
||||
};
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
@ -68,9 +64,7 @@ struct AccessorConstructor{
|
||||
template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
|
||||
};
|
||||
|
||||
SYCLUNARYCATEGORYEXTACC(const)
|
||||
@ -83,9 +77,7 @@ SYCLUNARYCATEGORYEXTACC()
|
||||
template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
|
||||
};
|
||||
|
||||
SYCLBINARYCATEGORYEXTACC(const)
|
||||
@ -98,9 +90,7 @@ SYCLBINARYCATEGORYEXTACC()
|
||||
template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
|
||||
};
|
||||
|
||||
SYCLTERNARYCATEGORYEXTACC(const)
|
||||
@ -114,9 +104,7 @@ SYCLTERNARYCATEGORYEXTACC()
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
|
||||
};
|
||||
|
||||
SYCLSELECTOPEXTACC(const)
|
||||
@ -128,9 +116,7 @@ SYCLSELECTOPEXTACC()
|
||||
template <typename LHSExpr, typename RHSExpr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
|
||||
};
|
||||
|
||||
SYCLTENSORASSIGNOPEXTACC(const)
|
||||
@ -142,9 +128,7 @@ struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>,
|
||||
template <typename PlainObjectType, int Options_, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
|
||||
return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
|
||||
};
|
||||
|
||||
TENSORMAPEXPR(const, cl::sycl::access::mode::read)
|
||||
@ -156,9 +140,7 @@ TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
|
||||
template <typename Expr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
|
||||
return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
|
||||
};
|
||||
|
||||
SYCLFORCEDEVALEXTACC(const)
|
||||
@ -171,9 +153,7 @@ SYCLFORCEDEVALEXTACC()
|
||||
template <typename Expr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
|
||||
-> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){\
|
||||
return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));\
|
||||
}\
|
||||
RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
|
||||
};
|
||||
|
||||
SYCLEVALTOEXTACC(const)
|
||||
@ -185,43 +165,66 @@ SYCLEVALTOEXTACC()
|
||||
template <typename OP, typename Dim, typename Expr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
|
||||
return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
|
||||
};
|
||||
|
||||
SYCLREDUCTIONEXTACC(const)
|
||||
SYCLREDUCTIONEXTACC()
|
||||
#undef SYCLREDUCTIONEXTACC
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
|
||||
#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
|
||||
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
|
||||
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
|
||||
};
|
||||
|
||||
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
|
||||
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
|
||||
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
|
||||
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
|
||||
#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
|
||||
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// const TensorSlicingOp. This is a special case where there is no OP
|
||||
/// const TensorSlicingOp.
|
||||
#define SYCLSLICEOPEXTACC(CVQual)\
|
||||
template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.impl());\
|
||||
}\
|
||||
RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
|
||||
};
|
||||
|
||||
SYCLSLICEOPEXTACC(const)
|
||||
SYCLSLICEOPEXTACC()
|
||||
#undef SYCLSLICEOPEXTACC
|
||||
|
||||
// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// TensorStridingSlicingOp.
|
||||
#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
|
||||
};
|
||||
|
||||
SYCLSLICESTRIDEOPEXTACC(const)
|
||||
SYCLSLICESTRIDEOPEXTACC()
|
||||
#undef SYCLSLICESTRIDEOPEXTACC
|
||||
|
||||
// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// TensorChippingOp.
|
||||
#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\
|
||||
template<DenseIndex DimId, typename XprType, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
|
||||
};
|
||||
|
||||
SYCLTENSORCHIPPINGOPEXTACC(const)
|
||||
SYCLTENSORCHIPPINGOPEXTACC()
|
||||
#undef SYCLTENSORCHIPPINGOPEXTACC
|
||||
|
||||
|
||||
/// template deduction for \ref ExtractAccessor
|
||||
template <typename Evaluator>
|
||||
|
@ -42,6 +42,20 @@ template <typename Evaluator> struct FunctorExtractor{
|
||||
|
||||
};
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
|
||||
/// TensorConversionOp
|
||||
#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
|
||||
template <typename ArgType1, typename ArgType2, typename Dev>\
|
||||
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
|
||||
FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
|
||||
FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
|
||||
: subExpr(expr.impl()) {}\
|
||||
};
|
||||
|
||||
SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
|
||||
SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
|
||||
#undef SYCLEXTRFUNCCONVERSION
|
||||
|
||||
#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
|
||||
template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
|
||||
struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
|
||||
@ -169,6 +183,24 @@ SYCLEXTRFUNCREDUCTIONOP(const)
|
||||
SYCLEXTRFUNCREDUCTIONOP()
|
||||
#undef SYCLEXTRFUNCREDUCTIONOP
|
||||
|
||||
#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
|
||||
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
|
||||
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
|
||||
typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
|
||||
typedef typename Evaluator::Dimensions Dimensions;\
|
||||
const Dimensions m_dimensions;\
|
||||
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
|
||||
FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
|
||||
: m_dimensions(expr.dimensions()) {}\
|
||||
};
|
||||
|
||||
|
||||
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
|
||||
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
|
||||
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
|
||||
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
|
||||
#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
|
||||
|
||||
/// specialisation of the \ref FunctorExtractor struct when the node type is
|
||||
/// const TensorSlicingOp. This is a specialisation without OP, so it has to be separated.
|
||||
#define SYCLEXTRFUNCTSLICEOP(CVQual)\
|
||||
@ -253,14 +285,27 @@ struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>,
|
||||
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
|
||||
};
|
||||
|
||||
// TensorContractionOp
|
||||
SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const)
|
||||
SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),)
|
||||
// TensorConcatenationOp
|
||||
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
|
||||
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
|
||||
#undef SYCLEXTRFUNCCONTRACTCONCAT
|
||||
|
||||
//TensorChippingOp
|
||||
#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\
|
||||
template<DenseIndex DimId, typename XprType, typename Device>\
|
||||
struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\
|
||||
FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
|
||||
const DenseIndex m_dim;\
|
||||
const DenseIndex m_offset;\
|
||||
EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\
|
||||
EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\
|
||||
FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\
|
||||
: xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\
|
||||
};
|
||||
|
||||
SYCLEXTRFUNCCHIPPINGOP(const)
|
||||
SYCLEXTRFUNCCHIPPINGOP()
|
||||
#undef SYCLEXTRFUNCCHIPPINGOP
|
||||
|
||||
/// template deduction function for FunctorExtractor
|
||||
template <typename Evaluator>
|
||||
|
@ -18,13 +18,14 @@ namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
|
||||
template<typename CoeffReturnType, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
|
||||
template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
|
||||
OP op;
|
||||
OutputAccessor aOut;
|
||||
InputAccessor aI;
|
||||
LocalAccessor scratch;
|
||||
size_t length, local;
|
||||
GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
|
||||
: aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
|
||||
GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
|
||||
: op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
size_t globalid = itemID.get_global(0);
|
||||
size_t localid = itemID.get_local(0);
|
||||
@ -44,7 +45,12 @@ namespace internal {
|
||||
auto min = (length < local) ? length : local;
|
||||
for (size_t offset = min / 2; offset > 0; offset /= 2) {
|
||||
if (localid < offset) {
|
||||
scratch[localid] += scratch[localid + offset];
|
||||
auto accum = op.initialize();
|
||||
op.reduce(scratch[localid], &accum);
|
||||
op.reduce(scratch[localid + offset], &accum);
|
||||
op.finalize(accum);
|
||||
scratch[localid]=accum;
|
||||
//scratch[localid] += scratch[localid + offset];
|
||||
}
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
}
|
||||
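(For orientation only, not part of the patch: the hunk above replaces the raw "+=" accumulation with the reducer's initialize/reduce/finalize protocol. A minimal, hedged host-side sketch of that protocol, using the existing Eigen::internal::SumReducer:)

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  // Same initialize/reduce/finalize sequence the kernel now performs per tree-reduction step.
  Eigen::internal::SumReducer<float> op;
  float accum = op.initialize();                 // identity element (0 for a sum)
  op.reduce(3.f, &accum);                        // accum = 3
  op.reduce(4.f, &accum);                        // accum = 7
  std::cout << op.finalize(accum) << std::endl;  // prints 7
}
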
@ -66,7 +72,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
|
||||
public:
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
|
||||
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_)
|
||||
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
|
||||
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
|
||||
@ -99,6 +105,46 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
|
||||
Index range;
|
||||
};
|
||||
|
||||
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
|
||||
class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
|
||||
public:
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
|
||||
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
|
||||
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
|
||||
Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
|
||||
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
|
||||
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
|
||||
auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
/// reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node to pass the
/// calculated result to its parent kernel. The latter is detected automatically through our device expression generator; the former is created here.
|
||||
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
|
||||
/// This is the evaluator for device_self_expr. It is identical to the self that has been passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
|
||||
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
|
||||
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
|
||||
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
|
||||
/// const cast added as a naive solution to solve the qualifier drop error
|
||||
auto globalid=static_cast<Index>(itemID.get_global_linear_id());
|
||||
if (globalid< range) {
|
||||
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
|
||||
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
|
||||
functor.finalize(accum);
|
||||
output_accessor_ptr[globalid]= accum/num_values_to_reduce;
|
||||
}
|
||||
}
|
||||
private:
|
||||
write_accessor output_accessor;
|
||||
FunctorExpr functors;
|
||||
Tuple_of_Acc tuple_of_accessors;
|
||||
Dims dims;
|
||||
Op functor;
|
||||
Index range;
|
||||
Index num_values_to_reduce;
|
||||
};
|
||||
|
||||
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
|
||||
class FullReductionKernelFunctor{
|
||||
@ -128,18 +174,70 @@ public:
|
||||
/// const cast added as a naive solution to solve the qualifier drop error
|
||||
auto globalid=itemID.get_global_linear_id();
|
||||
|
||||
if(globalid<rng)
|
||||
tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
|
||||
else
|
||||
tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
|
||||
tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
|
||||
: static_cast<CoeffReturnType>(op.initialize());
|
||||
|
||||
if(remaining!=0 && globalid==0 )
|
||||
if(remaining!=0 && globalid==0 ){
|
||||
// this will add the rest of the input buffer when the input size is not divisible by red_factor.
|
||||
tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
|
||||
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
|
||||
reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
|
||||
auto accum = op.initialize();
|
||||
op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
|
||||
op.reduce(remaining_reduce, &accum);
|
||||
op.finalize(accum);
|
||||
tmp_global_accessor.get_pointer()[0]=accum;
|
||||
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
|
||||
class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
|
||||
public:
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
|
||||
|
||||
OutAccessor tmp_global_accessor;
|
||||
Index rng , remaining, red_factor;
|
||||
Op op;
|
||||
Dims dims;
|
||||
FunctorExpr functors;
|
||||
TupleType tuple_of_accessors;
|
||||
|
||||
FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
|
||||
:tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
|
||||
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
|
||||
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
|
||||
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
/// reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node to pass the
/// calculated result to its parent kernel. The latter is detected automatically through our device expression generator; the former is created here.
|
||||
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
|
||||
/// This is the evaluator for device_self_expr. It is identical to the self that has been passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
|
||||
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
|
||||
/// const cast added as a naive solution to solve the qualifier drop error
|
||||
auto globalid=itemID.get_global_linear_id();
|
||||
auto scale = (rng*red_factor) + remaining;
|
||||
|
||||
tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
|
||||
:static_cast<CoeffReturnType>(op.initialize())/scale;
|
||||
|
||||
if(remaining!=0 && globalid==0 ){
|
||||
// this will add the rest of the input buffer when the input size is not divisible by red_factor.
|
||||
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
|
||||
auto accum = op.initialize();
|
||||
tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
|
||||
op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
|
||||
op.reduce(remaining_reduce, &accum);
|
||||
op.finalize(accum);
|
||||
tmp_global_accessor.get_pointer()[0]=accum/scale;
|
||||
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -115,6 +115,21 @@ REDUCTIONLEAFCOUNT(const)
|
||||
REDUCTIONLEAFCOUNT()
|
||||
#undef REDUCTIONLEAFCOUNT
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
|
||||
#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
|
||||
template <typename Indices, typename LhsXprType, typename RhsXprType>\
|
||||
struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
|
||||
static const size_t Count =1;\
|
||||
};
|
||||
|
||||
CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
|
||||
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
|
||||
CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
|
||||
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
|
||||
#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
|
||||
|
||||
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
|
||||
#define SLICEOPLEAFCOUNT(CVQual)\
|
||||
template <typename StartIndices, typename Sizes, typename XprType>\
|
||||
@ -124,6 +139,17 @@ SLICEOPLEAFCOUNT(const)
|
||||
SLICEOPLEAFCOUNT()
|
||||
#undef SLICEOPLEAFCOUNT
|
||||
|
||||
|
||||
/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
|
||||
#define CHIPPINGOPLEAFCOUNT(CVQual)\
|
||||
template <DenseIndex DimId, typename XprType>\
|
||||
struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{};
|
||||
|
||||
CHIPPINGOPLEAFCOUNT(const)
|
||||
CHIPPINGOPLEAFCOUNT()
|
||||
#undef CHIPPINGOPLEAFCOUNT
|
||||
|
||||
|
||||
#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
|
||||
struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{};
|
||||
|
@ -156,6 +156,18 @@ EVALTO()
|
||||
#undef EVALTO
|
||||
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorChippingOp
|
||||
#define CHIPPINGOP(CVQual)\
|
||||
template <DenseIndex DimId, typename Expr, size_t N>\
|
||||
struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\
|
||||
typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\
|
||||
};
|
||||
|
||||
CHIPPINGOP(const)
|
||||
CHIPPINGOP()
|
||||
#undef CHIPPINGOP
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorReductionOp
|
||||
#define SYCLREDUCTION(CVQual)\
|
||||
@ -168,6 +180,20 @@ SYCLREDUCTION()
|
||||
#undef SYCLREDUCTION
|
||||
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorContractionOp or TensorConvolutionOp
|
||||
#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
|
||||
template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
|
||||
struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
|
||||
typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
|
||||
};
|
||||
SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
|
||||
SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
|
||||
SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
|
||||
SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
|
||||
#undef SYCLCONTRACTIONCONVOLUTIONPLH
|
||||
|
||||
|
||||
/// specialisation of the \ref PlaceHolderExpression when the node is
|
||||
/// TensorCwiseSelectOp
|
||||
#define SLICEOPEXPR(CVQual)\
|
||||
|
@ -49,19 +49,38 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
|
||||
/// based expression tree;
|
||||
/// creates the expression tree for the device with accessor to buffers;
|
||||
/// construct the kernel and submit it to the sycl queue.
|
||||
/// std::array does not have TotalSize. So I have to get the size through template specialisation.
|
||||
template<typename , typename Dimensions> struct DimensionSize{
|
||||
static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
|
||||
return dim.TotalSize();
|
||||
}
|
||||
};
|
||||
#define DIMSIZEMACRO(CVQual)\
|
||||
template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
|
||||
static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
|
||||
return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
|
||||
}\
|
||||
};
|
||||
|
||||
DIMSIZEMACRO(const)
|
||||
DIMSIZEMACRO()
|
||||
#undef DIMSIZEMACRO
|
||||
|
||||
|
||||
template <typename Expr, typename Dev>
|
||||
void run(Expr &expr, Dev &dev) {
|
||||
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
typedef decltype(internal::extractFunctors(evaluator)) FunctorExpr;
|
||||
typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
|
||||
FunctorExpr functors = internal::extractFunctors(evaluator);
|
||||
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
// create a tuple of accessors from Evaluator
|
||||
typedef decltype(internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator)) TupleType;
|
||||
TupleType tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
|
||||
typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
|
||||
TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
|
||||
typename Expr::Index range, GRange, tileSize;
|
||||
dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange);
|
||||
typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
|
||||
dev.parallel_for_setup(total_size, tileSize, range, GRange);
|
||||
|
||||
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
|
||||
ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range
|
||||
|
@@ -61,10 +61,11 @@ struct MatrixExponentialScalingOp
* After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
* approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
*/
template <typename MatrixType>
void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V)
template <typename MatA, typename MatU, typename MatV>
void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V)
{
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
typedef typename MatA::PlainObject MatrixType;
typedef typename NumTraits<typename traits<MatA>::Scalar>::Real RealScalar;
const RealScalar b[] = {120.L, 60.L, 12.L, 1.L};
const MatrixType A2 = A * A;
const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
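(For reference, not part of the patch, and only a restatement of what matrix_exp_pade3 evaluates given the coefficients b = {120, 60, 12, 1} shown above; the U and V assignments themselves fall outside this hunk:

  U = A\,(b_3 A^2 + b_1 I), \qquad V = b_2 A^2 + b_0 I, \qquad \exp(A) \approx (V+U)(V-U)^{-1},

which matches the \f$ (V+U)(V-U)^{-1} \f$ form quoted in the doc comment.)
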
@ -77,9 +78,10 @@ void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
* After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
|
||||
* approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
|
||||
*/
|
||||
template <typename MatrixType>
|
||||
void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
template <typename MatA, typename MatU, typename MatV>
|
||||
void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V)
|
||||
{
|
||||
typedef typename MatA::PlainObject MatrixType;
|
||||
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
|
||||
const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L};
|
||||
const MatrixType A2 = A * A;
|
||||
@ -94,9 +96,10 @@ void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
* After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
|
||||
* approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
|
||||
*/
|
||||
template <typename MatrixType>
|
||||
void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
template <typename MatA, typename MatU, typename MatV>
|
||||
void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V)
|
||||
{
|
||||
typedef typename MatA::PlainObject MatrixType;
|
||||
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
|
||||
const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L};
|
||||
const MatrixType A2 = A * A;
|
||||
@ -114,9 +117,10 @@ void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
* After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
|
||||
* approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
|
||||
*/
|
||||
template <typename MatrixType>
|
||||
void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
template <typename MatA, typename MatU, typename MatV>
|
||||
void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V)
|
||||
{
|
||||
typedef typename MatA::PlainObject MatrixType;
|
||||
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
|
||||
const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L,
|
||||
2162160.L, 110880.L, 3960.L, 90.L, 1.L};
|
||||
@ -135,9 +139,10 @@ void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
* After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé
|
||||
* approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
|
||||
*/
|
||||
template <typename MatrixType>
|
||||
void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
template <typename MatA, typename MatU, typename MatV>
|
||||
void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V)
|
||||
{
|
||||
typedef typename MatA::PlainObject MatrixType;
|
||||
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
|
||||
const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L,
|
||||
1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L,
|
||||
@ -162,9 +167,10 @@ void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
* This function activates only if your long double is double-double or quadruple.
|
||||
*/
|
||||
#if LDBL_MANT_DIG > 64
|
||||
template <typename MatrixType>
|
||||
void matrix_exp_pade17(const MatrixType &A, MatrixType &U, MatrixType &V)
|
||||
template <typename MatA, typename MatU, typename MatV>
|
||||
void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V)
|
||||
{
|
||||
typedef typename MatA::PlainObject MatrixType;
|
||||
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
|
||||
const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L,
|
||||
100610229646136770560000.L, 15720348382208870400000.L,
|
||||
@ -204,7 +210,8 @@ struct matrix_exp_computeUV
|
||||
template <typename MatrixType>
|
||||
struct matrix_exp_computeUV<MatrixType, float>
|
||||
{
|
||||
static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
|
||||
template <typename ArgType>
|
||||
static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
|
||||
{
|
||||
using std::frexp;
|
||||
using std::pow;
|
||||
@ -227,7 +234,8 @@ struct matrix_exp_computeUV<MatrixType, float>
|
||||
template <typename MatrixType>
|
||||
struct matrix_exp_computeUV<MatrixType, double>
|
||||
{
|
||||
static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
|
||||
template <typename ArgType>
|
||||
static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
|
||||
{
|
||||
using std::frexp;
|
||||
using std::pow;
|
||||
@ -254,7 +262,8 @@ struct matrix_exp_computeUV<MatrixType, double>
|
||||
template <typename MatrixType>
|
||||
struct matrix_exp_computeUV<MatrixType, long double>
|
||||
{
|
||||
static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
|
||||
template <typename ArgType>
|
||||
static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
|
||||
{
|
||||
#if LDBL_MANT_DIG == 53 // double precision
|
||||
matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
|
||||
@ -339,9 +348,10 @@ struct matrix_exp_computeUV<MatrixType, long double>
|
||||
* \param arg argument of matrix exponential (should be plain object)
|
||||
* \param result variable in which result will be stored
|
||||
*/
|
||||
template <typename MatrixType, typename ResultType>
|
||||
void matrix_exp_compute(const MatrixType& arg, ResultType &result)
|
||||
template <typename ArgType, typename ResultType>
|
||||
void matrix_exp_compute(const ArgType& arg, ResultType &result)
|
||||
{
|
||||
typedef typename ArgType::PlainObject MatrixType;
|
||||
#if LDBL_MANT_DIG > 112 // rarely happens
|
||||
typedef typename traits<MatrixType>::Scalar Scalar;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
@@ -354,7 +364,7 @@ void matrix_exp_compute(const MatrixType& arg, ResultType &result)
MatrixType U, V;
int squarings;
matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
MatrixType numer = U + V;
MatrixType numer = U + V;
MatrixType denom = -U + V;
result = denom.partialPivLu().solve(numer);
for (int i=0; i<squarings; i++)

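(For orientation only, a minimal and hedged usage sketch, not part of the patch: matrix_exp_compute() is what ultimately runs when user code calls MatrixBase::exp() from the unsupported MatrixFunctions module.)

#include <unsupported/Eigen/MatrixFunctions>  // provides MatrixBase::exp()
#include <iostream>

int main() {
  Eigen::Matrix2d A;
  A << 0, 1,
      -1, 0;                     // A*A = -I, so exp(A) is a rotation by 1 radian
  Eigen::Matrix2d E = A.exp();   // evaluates via matrix_exp_compute(): build U, V, then solve
  std::cout << E << std::endl;   // approximately [[cos(1), sin(1)], [-sin(1), cos(1)]]
}
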
@@ -163,6 +163,10 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for
# older compiler that don't support cxx11.

@ -131,11 +131,6 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
|
||||
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
|
||||
QueueInterface queueInterface(d);
|
||||
auto sycl_device = Eigen::SyclDevice(&queueInterface);
|
||||
|
||||
test_broadcast_sycl_fixed<DataType, RowMajor, int>(sycl_device);
|
||||
test_broadcast_sycl<DataType, RowMajor, int>(sycl_device);
|
||||
test_broadcast_sycl_fixed<DataType, ColMajor, int>(sycl_device);
|
||||
test_broadcast_sycl<DataType, ColMajor, int>(sycl_device);
|
||||
test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
|
||||
test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
|
||||
test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
|
||||
|
@ -14,7 +14,7 @@
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
|
||||
#define EIGEN_USE_SYCL
|
||||
|
||||
#include "main.h"
|
||||
@ -32,20 +32,20 @@ template <typename T> T cube(T x) { return x * x * x; }
|
||||
template <typename T> T inverse(T x) { return 1 / x; }
|
||||
}
|
||||
|
||||
#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR) \
|
||||
#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \
|
||||
{ \
|
||||
/* out OPERATOR in.FUNC() */ \
|
||||
Tensor<SCALAR, 3> in(tensorRange); \
|
||||
Tensor<SCALAR, 3> out(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
|
||||
in = in.random() + static_cast<SCALAR>(0.01); \
|
||||
out = out.random() + static_cast<SCALAR>(0.01); \
|
||||
Tensor<SCALAR, 3> reference(out); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
|
||||
SCALAR *gpu_data = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(in.size() * sizeof(SCALAR))); \
|
||||
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
|
||||
(in.size()) * sizeof(SCALAR)); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
|
||||
@ -53,7 +53,7 @@ template <typename T> T inverse(T x) { return 1 / x; }
|
||||
gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \
|
||||
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
|
||||
(out.size()) * sizeof(SCALAR)); \
|
||||
for (int i = 0; i < out.size(); ++i) { \
|
||||
for (int64_t i = 0; i < out.size(); ++i) { \
|
||||
SCALAR ver = reference(i); \
|
||||
ver OPERATOR std::FUNC(in(i)); \
|
||||
VERIFY_IS_APPROX(out(i), ver); \
|
||||
@ -63,18 +63,18 @@ template <typename T> T inverse(T x) { return 1 / x; }
|
||||
} \
|
||||
{ \
|
||||
/* out OPERATOR out.FUNC() */ \
|
||||
Tensor<SCALAR, 3> out(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
|
||||
out = out.random() + static_cast<SCALAR>(0.01); \
|
||||
Tensor<SCALAR, 3> reference(out); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
|
||||
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
|
||||
(out.size()) * sizeof(SCALAR)); \
|
||||
gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \
|
||||
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
|
||||
(out.size()) * sizeof(SCALAR)); \
|
||||
for (int i = 0; i < out.size(); ++i) { \
|
||||
for (int64_t i = 0; i < out.size(); ++i) { \
|
||||
SCALAR ver = reference(i); \
|
||||
ver OPERATOR std::FUNC(reference(i)); \
|
||||
VERIFY_IS_APPROX(out(i), ver); \
|
||||
@ -82,61 +82,62 @@ template <typename T> T inverse(T x) { return 1 / x; }
|
||||
sycl_device.deallocate(gpu_data_out); \
|
||||
}
|
||||
|
||||
#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR)
|
||||
#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \
|
||||
TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout)
|
||||
|
||||
#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC) \
|
||||
#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \
|
||||
{ \
|
||||
/* out = in.FUNC() */ \
|
||||
Tensor<SCALAR, 3> in(tensorRange); \
|
||||
Tensor<bool, 3> out(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
|
||||
Tensor<bool, 3, Layout, int64_t> out(tensorRange); \
|
||||
in = in.random() + static_cast<SCALAR>(0.01); \
|
||||
SCALAR *gpu_data = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(in.size() * sizeof(SCALAR))); \
|
||||
bool *gpu_data_out = \
|
||||
static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \
|
||||
TensorMap<Tensor<bool, 3>> gpu_out(gpu_data_out, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
|
||||
TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
|
||||
(in.size()) * sizeof(SCALAR)); \
|
||||
gpu_out.device(sycl_device) = gpu.FUNC(); \
|
||||
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
|
||||
(out.size()) * sizeof(bool)); \
|
||||
for (int i = 0; i < out.size(); ++i) { \
|
||||
for (int64_t i = 0; i < out.size(); ++i) { \
|
||||
VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \
|
||||
} \
|
||||
sycl_device.deallocate(gpu_data); \
|
||||
sycl_device.deallocate(gpu_data_out); \
|
||||
}
|
||||

#define TEST_UNARY_BUILTINS(SCALAR) \
TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=) \
TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf)
#define TEST_UNARY_BUILTINS(SCALAR, Layout) \
TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \
TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout)

static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
int sizeDim1 = 10;
int sizeDim2 = 10;
int sizeDim3 = 10;
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
int64_t sizeDim1 = 10;
int64_t sizeDim2 = 10;
int64_t sizeDim3 = 10;
array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};

TEST_UNARY_BUILTINS(float)
TEST_UNARY_BUILTINS(float, RowMajor)
TEST_UNARY_BUILTINS(float, ColMajor)
}

namespace std {
|
||||
@ -144,24 +145,24 @@ template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); }
|
||||
template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
|
||||
}
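The two forwarding functions in the namespace std block above exist so that the binary macro below can always spell its host reference as std::FUNC(in_1(i), in_2(i)): for FUNC = cwiseMax or cwiseMin there is no such standard function, so they are mapped onto std::max and std::min. A small sketch of the expression they make valid; it relies on the shims above being in scope in this file, and the function name is ours.

// Sketch (illustrative, inside this translation unit only):
static void sketch_std_shim_usage() {
  float r1 = std::cwiseMax(1.0f, 2.0f);  // resolves to std::max(1.0f, 2.0f) == 2.0f
  float r2 = std::cwiseMin(1.0f, 2.0f);  // resolves to std::min(1.0f, 2.0f) == 1.0f
  (void)r1; (void)r2;
}
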
#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC) \
|
||||
#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \
|
||||
{ \
|
||||
/* out = in_1.FUNC(in_2) */ \
|
||||
Tensor<SCALAR, 3> in_1(tensorRange); \
|
||||
Tensor<SCALAR, 3> in_2(tensorRange); \
|
||||
Tensor<SCALAR, 3> out(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
|
||||
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
|
||||
in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
|
||||
Tensor<SCALAR, 3> reference(out); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
|
||||
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
|
||||
SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
|
||||
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
|
||||
(in_1.size()) * sizeof(SCALAR)); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
|
||||
@ -169,7 +170,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
|
||||
gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
|
||||
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
|
||||
(out.size()) * sizeof(SCALAR)); \
|
||||
for (int i = 0; i < out.size(); ++i) { \
|
||||
for (int64_t i = 0; i < out.size(); ++i) { \
|
||||
SCALAR ver = reference(i); \
|
||||
ver = std::FUNC(in_1(i), in_2(i)); \
|
||||
VERIFY_IS_APPROX(out(i), ver); \
|
||||
@ -179,24 +180,24 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
|
||||
sycl_device.deallocate(gpu_data_out); \
|
||||
}
|
||||
|
||||
#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR) \
|
||||
#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \
|
||||
{ \
|
||||
/* out = in_1 OPERATOR in_2 */ \
|
||||
Tensor<SCALAR, 3> in_1(tensorRange); \
|
||||
Tensor<SCALAR, 3> in_2(tensorRange); \
|
||||
Tensor<SCALAR, 3> out(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
|
||||
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
|
||||
in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
|
||||
Tensor<SCALAR, 3> reference(out); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
|
||||
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
|
||||
SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
|
||||
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
|
||||
(in_1.size()) * sizeof(SCALAR)); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
|
||||
@ -204,7 +205,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
|
||||
gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
|
||||
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
|
||||
(out.size()) * sizeof(SCALAR)); \
|
||||
for (int i = 0; i < out.size(); ++i) { \
|
||||
for (int64_t i = 0; i < out.size(); ++i) { \
|
||||
VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \
|
||||
} \
|
||||
sycl_device.deallocate(gpu_data_1); \
|
||||
@ -212,46 +213,48 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
|
||||
sycl_device.deallocate(gpu_data_out); \
|
||||
}
|
||||
|
||||
#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR) \
|
||||
#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \
|
||||
{ \
|
||||
/* out = in_1 OPERATOR 2 */ \
|
||||
Tensor<SCALAR, 3> in_1(tensorRange); \
|
||||
Tensor<SCALAR, 3> out(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
|
||||
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
|
||||
Tensor<SCALAR, 3> reference(out); \
|
||||
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
|
||||
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
|
||||
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
|
||||
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
|
||||
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
|
||||
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
|
||||
(in_1.size()) * sizeof(SCALAR)); \
|
||||
gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
|
||||
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
|
||||
(out.size()) * sizeof(SCALAR)); \
|
||||
for (int i = 0; i < out.size(); ++i) { \
|
||||
for (int64_t i = 0; i < out.size(); ++i) { \
|
||||
VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \
|
||||
} \
|
||||
sycl_device.deallocate(gpu_data_1); \
|
||||
sycl_device.deallocate(gpu_data_out); \
|
||||
}
|
||||
|
||||
#define TEST_BINARY_BUILTINS(SCALAR) \
|
||||
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax) \
|
||||
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin) \
|
||||
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, +) \
|
||||
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, -) \
|
||||
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, *) \
|
||||
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, /)
|
||||
#define TEST_BINARY_BUILTINS(SCALAR, Layout) \
|
||||
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \
|
||||
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \
|
||||
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \
|
||||
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \
|
||||
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \
|
||||
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout)
|
||||
|
||||
static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
|
||||
int sizeDim1 = 10;
|
||||
int sizeDim2 = 10;
|
||||
int sizeDim3 = 10;
|
||||
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
|
||||
TEST_BINARY_BUILTINS(float)
|
||||
TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %)
|
||||
int64_t sizeDim1 = 10;
|
||||
int64_t sizeDim2 = 10;
|
||||
int64_t sizeDim3 = 10;
|
||||
array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
|
||||
TEST_BINARY_BUILTINS(float, RowMajor)
|
||||
TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor)
|
||||
TEST_BINARY_BUILTINS(float, ColMajor)
|
||||
TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
|
||||
}
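The body of the top-level entry point that follows is cut off in this hunk. It presumably dispatches the unary and binary tests once per supported SYCL device, in the same style as the per-device drivers shown later in this commit; the sketch below spells out that assumption and is not the literal code of the patch.

// Assumed shape of the driver (modeled on the chipping/concatenation drivers below):
// void test_cxx11_tensor_builtins_sycl() {
//   for (const auto& device : Eigen::get_sycl_supported_devices()) {
//     QueueInterface queueInterface(device);
//     auto sycl_device = Eigen::SyclDevice(&queueInterface);
//     test_builtin_unary_sycl(sycl_device);
//     test_builtin_binary_sycl(sycl_device);
//   }
// }
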
void test_cxx11_tensor_builtins_sycl() {
|
||||
|
@ -43,7 +43,7 @@ static void test_simple_chip()
|
||||
VERIFY_IS_EQUAL(chip2.dimension(2), 7);
|
||||
VERIFY_IS_EQUAL(chip2.dimension(3), 11);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int j = 0; j < 5; ++j) {
|
||||
for (int k = 0; k < 7; ++k) {
|
||||
for (int l = 0; l < 11; ++l) {
|
||||
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
|
||||
@ -75,7 +75,7 @@ static void test_simple_chip()
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
for (int l = 0; l < 11; ++l) {
|
||||
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
|
||||
}
|
||||
}
|
||||
@ -126,7 +126,7 @@ static void test_dynamic_chip()
|
||||
VERIFY_IS_EQUAL(chip2.dimension(2), 7);
|
||||
VERIFY_IS_EQUAL(chip2.dimension(3), 11);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int j = 0; j < 5; ++j) {
|
||||
for (int k = 0; k < 7; ++k) {
|
||||
for (int l = 0; l < 11; ++l) {
|
||||
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
|
||||
@ -158,7 +158,7 @@ static void test_dynamic_chip()
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
for (int l = 0; l < 11; ++l) {
|
||||
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
|
||||
}
|
||||
}
|
||||
unsupported/test/cxx11_tensor_chipping_sycl.cpp (new file, 622 lines)
@ -0,0 +1,622 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
// Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_chipping_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"

#include <Eigen/CXX11/Tensor>

using Eigen::Tensor;

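A side note on the EIGEN_DEFAULT_DENSE_INDEX_TYPE define above: it switches Eigen's default dense index type (Eigen::DenseIndex / Eigen::Index) to int64_t, which is why the tensors in this file can name int64_t as their IndexType and still interoperate with code that uses the default. A one-line sketch of the effect; the assertion message is ours.

#include <cstdint>
static_assert(sizeof(Eigen::DenseIndex) == sizeof(std::int64_t),
              "with EIGEN_DEFAULT_DENSE_INDEX_TYPE set to int64_t, the default index is 64-bit");
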
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
IndexType sizeDim1 = 2;
|
||||
IndexType sizeDim2 = 3;
|
||||
IndexType sizeDim3 = 5;
|
||||
IndexType sizeDim4 = 7;
|
||||
IndexType sizeDim5 = 11;
|
||||
|
||||
array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
|
||||
Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
|
||||
|
||||
tensor.setRandom();
|
||||
|
||||
const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
|
||||
const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
|
||||
DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
|
||||
DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
|
||||
|
||||
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
|
||||
gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l);
|
||||
sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
|
||||
VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
|
||||
VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
|
||||
VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim2; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim3; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim4; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim5; ++l) {
|
||||
VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
|
||||
const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
|
||||
DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
|
||||
|
||||
gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l);
|
||||
sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
|
||||
VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
|
||||
VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
|
||||
VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim1; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim3; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim4; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim5; ++l) {
|
||||
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
|
||||
const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
|
||||
DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
|
||||
|
||||
gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l);
|
||||
sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
|
||||
VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
|
||||
VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
|
||||
VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim1; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim2; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim4; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim5; ++l) {
|
||||
VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
|
||||
const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
|
||||
DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
|
||||
|
||||
gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l);
|
||||
sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
|
||||
VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
|
||||
VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
|
||||
VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim1; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim2; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim3; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim5; ++l) {
|
||||
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
|
||||
const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
|
||||
DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
|
||||
|
||||
gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l);
|
||||
sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
|
||||
VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
|
||||
VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
|
||||
VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim1; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim2; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim3; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim4; ++l) {
|
||||
VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sycl_device.deallocate(gpu_data_tensor);
|
||||
sycl_device.deallocate(gpu_data_chip1);
|
||||
sycl_device.deallocate(gpu_data_chip2);
|
||||
sycl_device.deallocate(gpu_data_chip3);
|
||||
sycl_device.deallocate(gpu_data_chip4);
|
||||
sycl_device.deallocate(gpu_data_chip5);
|
||||
}
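For readers skimming the checks above: chip<Dim>(offset) fixes index Dim at the given offset and removes that dimension, so chip<0>(1) of a 2x3x5x7x11 tensor is the 3x5x7x11 slice tensor(1, ...). A minimal host-only illustration; the function name is ours, not part of the patch.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

void sketch_chip_semantics() {
  Eigen::Tensor<float, 5, Eigen::RowMajor, int64_t> t(2, 3, 5, 7, 11);
  t.setRandom();
  // Rank 5 -> rank 4: dimension 0 is fixed at offset 1 and disappears.
  Eigen::Tensor<float, 4, Eigen::RowMajor, int64_t> c = t.chip<0>(1);
  assert(c.dimension(0) == 3 && c.dimension(3) == 11);
  assert(c(2, 4, 6, 10) == t(1, 2, 4, 6, 10));
}
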
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
IndexType sizeDim1 = 2;
|
||||
IndexType sizeDim2 = 3;
|
||||
IndexType sizeDim3 = 5;
|
||||
IndexType sizeDim4 = 7;
|
||||
IndexType sizeDim5 = 11;
|
||||
|
||||
array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
|
||||
Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
|
||||
|
||||
tensor.setRandom();
|
||||
|
||||
const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
|
||||
const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
|
||||
DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
|
||||
DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
|
||||
|
||||
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
|
||||
gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l);
|
||||
sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
|
||||
VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
|
||||
VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
|
||||
VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim2; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim3; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim4; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim5; ++l) {
|
||||
VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
|
||||
const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
|
||||
DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
|
||||
|
||||
gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l);
|
||||
sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
|
||||
VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
|
||||
VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
|
||||
VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim1; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim3; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim4; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim5; ++l) {
|
||||
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
|
||||
const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
|
||||
DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
|
||||
|
||||
gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l);
|
||||
sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
|
||||
VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
|
||||
VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
|
||||
VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim1; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim2; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim4; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim5; ++l) {
|
||||
VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
|
||||
const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
|
||||
DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
|
||||
|
||||
gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l);
|
||||
sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
|
||||
VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
|
||||
VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
|
||||
VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim1; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim2; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim3; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim5; ++l) {
|
||||
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
|
||||
const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
|
||||
DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
|
||||
|
||||
gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l);
|
||||
sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
|
||||
|
||||
VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
|
||||
VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
|
||||
VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
|
||||
VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
|
||||
|
||||
for (IndexType i = 0; i < sizeDim1; ++i) {
|
||||
for (IndexType j = 0; j < sizeDim2; ++j) {
|
||||
for (IndexType k = 0; k < sizeDim3; ++k) {
|
||||
for (IndexType l = 0; l < sizeDim4; ++l) {
|
||||
VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(gpu_data_tensor);
|
||||
sycl_device.deallocate(gpu_data_chip1);
|
||||
sycl_device.deallocate(gpu_data_chip2);
|
||||
sycl_device.deallocate(gpu_data_chip3);
|
||||
sycl_device.deallocate(gpu_data_chip4);
|
||||
sycl_device.deallocate(gpu_data_chip5);
|
||||
}
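The function above uses the run-time overload chip(offset, dim); it produces the same slice as the compile-time chip<dim>(offset) exercised by test_static_chip_sycl, just with the dimension chosen at run time. A host-only sketch of that equivalence, with an illustrative function name.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

void sketch_dynamic_vs_static_chip() {
  Eigen::Tensor<float, 3, Eigen::ColMajor, int64_t> t(2, 3, 5);
  t.setRandom();
  Eigen::Tensor<float, 2, Eigen::ColMajor, int64_t> a = t.chip<1>(2);  // dimension known at compile time
  Eigen::Tensor<float, 2, Eigen::ColMajor, int64_t> b = t.chip(2, 1);  // dimension chosen at run time
  for (int64_t i = 0; i < a.size(); ++i) {
    assert(a(i) == b(i));  // identical slices
  }
}
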
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) {
|
||||
|
||||
IndexType sizeDim1 = 2;
|
||||
IndexType sizeDim2 = 3;
|
||||
IndexType sizeDim3 = 5;
|
||||
IndexType sizeDim4 = 7;
|
||||
IndexType sizeDim5 = 11;
|
||||
|
||||
array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
|
||||
Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
|
||||
|
||||
Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
|
||||
Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange);
|
||||
tensor.setRandom();
|
||||
tensor1.setRandom();
|
||||
|
||||
const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
|
||||
const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
|
||||
DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
|
||||
DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
|
||||
DataType* gpu_data_tensor1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
|
||||
|
||||
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange);
|
||||
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
|
||||
sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize);
|
||||
gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1;
|
||||
sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
|
||||
|
||||
for (int i = 0; i < sizeDim2; ++i) {
|
||||
for (int j = 0; j < sizeDim3; ++j) {
|
||||
for (int k = 0; k < sizeDim4; ++k) {
|
||||
for (int l = 0; l < sizeDim5; ++l) {
|
||||
float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l);
|
||||
VERIFY_IS_EQUAL(chip1(i,j,k,l), expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange);
|
||||
tensor2.setRandom();
|
||||
const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType);
|
||||
DataType* gpu_data_tensor2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
|
||||
DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange);
|
||||
TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize);
|
||||
gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2;
|
||||
sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
|
||||
|
||||
for (int i = 0; i < sizeDim2; ++i) {
|
||||
for (int j = 0; j < sizeDim4; ++j) {
|
||||
for (int k = 0; k < sizeDim5; ++k) {
|
||||
float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k);
|
||||
VERIFY_IS_EQUAL(chip2(i,j,k), expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(gpu_data_tensor);
|
||||
sycl_device.deallocate(gpu_data_tensor1);
|
||||
sycl_device.deallocate(gpu_data_chip1);
|
||||
sycl_device.deallocate(gpu_data_tensor2);
|
||||
sycl_device.deallocate(gpu_data_chip2);
|
||||
}
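The test above builds chips inside larger expressions, including a chained chip<0>(0).chip<1>(2), which simply fixes two indices at once and matches the reference value tensor(0, i, 2, j, k) used in its verification loop. A host-only illustration of the chaining; the function name is ours.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

void sketch_chained_chip() {
  Eigen::Tensor<float, 5, Eigen::RowMajor, int64_t> t(2, 3, 5, 7, 11);
  t.setRandom();
  // Two chips in a row fix two indices: the result holds t(0, i, 2, j, k).
  Eigen::Tensor<float, 3, Eigen::RowMajor, int64_t> c = t.chip<0>(0).chip<1>(2);
  assert(c(1, 3, 4) == t(0, 1, 2, 3, 4));
}
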
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
|
||||
IndexType sizeDim1 = 2;
|
||||
IndexType sizeDim2 = 3;
|
||||
IndexType sizeDim3 = 5;
|
||||
IndexType sizeDim4 = 7;
|
||||
IndexType sizeDim5 = 11;
|
||||
|
||||
array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
|
||||
Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
|
||||
Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange);
|
||||
Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange);
|
||||
input1.setRandom();
|
||||
input2.setRandom();
|
||||
|
||||
|
||||
const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
|
||||
const size_t input2TensorBuffSize =input2.size()*sizeof(DataType);
|
||||
DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
|
||||
DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
|
||||
DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize));
|
||||
|
||||
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
|
||||
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange);
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize);
|
||||
gpu_tensor.device(sycl_device)=gpu_input1;
|
||||
sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize);
|
||||
gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2;
|
||||
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
|
||||
|
||||
for (int i = 0; i < sizeDim1; ++i) {
|
||||
for (int j = 0; j < sizeDim2; ++j) {
|
||||
for (int k = 0; k < sizeDim3; ++k) {
|
||||
for (int l = 0; l < sizeDim4; ++l) {
|
||||
for (int m = 0; m < sizeDim5; ++m) {
|
||||
if (i != 1) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
|
||||
} else {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
gpu_tensor.device(sycl_device)=gpu_input1;
|
||||
array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange);
|
||||
input3.setRandom();
|
||||
|
||||
const size_t input3TensorBuffSize =input3.size()*sizeof(DataType);
|
||||
DataType* gpu_data_input3 = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize);
|
||||
gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3;
|
||||
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
|
||||
|
||||
for (int i = 0; i < sizeDim1; ++i) {
|
||||
for (int j = 0; j < sizeDim2; ++j) {
|
||||
for (int k = 0; k <sizeDim3; ++k) {
|
||||
for (int l = 0; l < sizeDim4; ++l) {
|
||||
for (int m = 0; m < sizeDim5; ++m) {
|
||||
if (j != 1) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
|
||||
} else {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
gpu_tensor.device(sycl_device)=gpu_input1;
|
||||
array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange);
|
||||
input4.setRandom();
|
||||
|
||||
const size_t input4TensorBuffSize =input4.size()*sizeof(DataType);
|
||||
DataType* gpu_data_input4 = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize);
|
||||
gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4;
|
||||
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
|
||||
|
||||
for (int i = 0; i < sizeDim1; ++i) {
|
||||
for (int j = 0; j < sizeDim2; ++j) {
|
||||
for (int k = 0; k <sizeDim3; ++k) {
|
||||
for (int l = 0; l < sizeDim4; ++l) {
|
||||
for (int m = 0; m < sizeDim5; ++m) {
|
||||
if (k != 3) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
|
||||
} else {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
gpu_tensor.device(sycl_device)=gpu_input1;
|
||||
array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange);
|
||||
input5.setRandom();
|
||||
|
||||
const size_t input5TensorBuffSize =input5.size()*sizeof(DataType);
|
||||
DataType* gpu_data_input5 = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize);
|
||||
gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5;
|
||||
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
|
||||
|
||||
for (int i = 0; i < sizeDim1; ++i) {
|
||||
for (int j = 0; j < sizeDim2; ++j) {
|
||||
for (int k = 0; k <sizeDim3; ++k) {
|
||||
for (int l = 0; l < sizeDim4; ++l) {
|
||||
for (int m = 0; m < sizeDim5; ++m) {
|
||||
if (l != 4) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
|
||||
} else {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
gpu_tensor.device(sycl_device)=gpu_input1;
|
||||
array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
|
||||
Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange);
|
||||
input6.setRandom();
|
||||
|
||||
const size_t input6TensorBuffSize =input6.size()*sizeof(DataType);
|
||||
DataType* gpu_data_input6 = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize);
|
||||
gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6;
|
||||
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
|
||||
|
||||
for (int i = 0; i < sizeDim1; ++i) {
|
||||
for (int j = 0; j < sizeDim2; ++j) {
|
||||
for (int k = 0; k <sizeDim3; ++k) {
|
||||
for (int l = 0; l < sizeDim4; ++l) {
|
||||
for (int m = 0; m < sizeDim5; ++m) {
|
||||
if (m != 5) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
|
||||
} else {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
gpu_tensor.device(sycl_device)=gpu_input1;
|
||||
Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange);
|
||||
input7.setRandom();
|
||||
|
||||
DataType* gpu_data_input7 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
|
||||
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize);
|
||||
gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l);
|
||||
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
|
||||
|
||||
for (int i = 0; i < sizeDim1; ++i) {
|
||||
for (int j = 0; j < sizeDim2; ++j) {
|
||||
for (int k = 0; k <sizeDim3; ++k) {
|
||||
for (int l = 0; l < sizeDim4; ++l) {
|
||||
for (int m = 0; m < sizeDim5; ++m) {
|
||||
if (i != 0) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
|
||||
} else {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(gpu_data_tensor);
|
||||
sycl_device.deallocate(gpu_data_input1);
|
||||
sycl_device.deallocate(gpu_data_input2);
|
||||
sycl_device.deallocate(gpu_data_input3);
|
||||
sycl_device.deallocate(gpu_data_input4);
|
||||
sycl_device.deallocate(gpu_data_input5);
|
||||
sycl_device.deallocate(gpu_data_input6);
|
||||
sycl_device.deallocate(gpu_data_input7);
|
||||
|
||||
}
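The lvalue test above writes through a chip: assigning to chip<Dim>(offset) overwrites exactly one slice of the destination tensor and leaves every other element untouched, which is what its element-wise checks assert. A minimal host-only sketch with an illustrative function name.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

void sketch_chip_as_lvalue() {
  Eigen::Tensor<float, 3, Eigen::RowMajor, int64_t> t(2, 3, 5);
  Eigen::Tensor<float, 2, Eigen::RowMajor, int64_t> slice(3, 5);
  t.setZero();
  slice.setConstant(1.0f);
  t.chip<0>(1) = slice;  // overwrites only the t(1, j, k) slice
  assert(t(0, 0, 0) == 0.0f);
  assert(t(1, 2, 4) == 1.0f);
}
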
template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device);
test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);
test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_chipping_sycl()
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
}
}
@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
@ -22,39 +22,39 @@
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
template<typename DataType, int DataLayout, typename Index>
|
||||
template<typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
Index leftDim1 = 2;
|
||||
Index leftDim2 = 3;
|
||||
Index leftDim3 = 1;
|
||||
Eigen::array<Index, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
|
||||
Index rightDim1 = 2;
|
||||
Index rightDim2 = 3;
|
||||
Index rightDim3 = 1;
|
||||
Eigen::array<Index, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
|
||||
IndexType leftDim1 = 2;
|
||||
IndexType leftDim2 = 3;
|
||||
IndexType leftDim3 = 1;
|
||||
Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
|
||||
IndexType rightDim1 = 2;
|
||||
IndexType rightDim2 = 3;
|
||||
IndexType rightDim3 = 1;
|
||||
Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
|
||||
|
||||
//Index concatDim1 = 3;
|
||||
// Index concatDim2 = 3;
|
||||
// Index concatDim3 = 1;
|
||||
//Eigen::array<Index, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
|
||||
//IndexType concatDim1 = 3;
|
||||
// IndexType concatDim2 = 3;
|
||||
// IndexType concatDim3 = 1;
|
||||
//Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
|
||||
|
||||
Tensor<DataType, 3, DataLayout, Index> left(leftRange);
|
||||
Tensor<DataType, 3, DataLayout, Index> right(rightRange);
|
||||
Tensor<DataType, 3, DataLayout, IndexType> left(leftRange);
|
||||
Tensor<DataType, 3, DataLayout, IndexType> right(rightRange);
|
||||
left.setRandom();
|
||||
right.setRandom();
|
||||
|
||||
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
|
||||
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
|
||||
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
|
||||
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
|
||||
///
|
||||
Tensor<DataType, 3, DataLayout, Index> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
|
||||
Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
|
||||
DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
|
||||
|
||||
//concatenation = left.concatenate(right, 0);
|
||||
gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
|
||||
@ -63,19 +63,19 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
|
||||
VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
|
||||
VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
|
||||
VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (IndexType j = 0; j < 3; ++j) {
|
||||
for (IndexType i = 0; i < 2; ++i) {
|
||||
VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
|
||||
}
|
||||
for (int i = 2; i < 4; ++i) {
|
||||
for (IndexType i = 2; i < 4; ++i) {
|
||||
VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
|
||||
}
|
||||
}
|
||||
|
||||
sycl_device.deallocate(gpu_out_data1);
|
||||
Tensor<DataType, 3, DataLayout, Index> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
|
||||
Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
|
||||
DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
|
||||
gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
|
||||
sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
|
||||
|
||||
@ -83,18 +83,18 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
|
||||
VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
|
||||
VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
|
||||
VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (IndexType i = 0; i < 2; ++i) {
|
||||
for (IndexType j = 0; j < 3; ++j) {
|
||||
VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
|
||||
}
|
||||
for (int j = 3; j < 6; ++j) {
|
||||
for (IndexType j = 3; j < 6; ++j) {
|
||||
VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(gpu_out_data2);
|
||||
Tensor<DataType, 3, DataLayout, Index> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
|
||||
Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
|
||||
DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
|
||||
gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
|
||||
sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
|
||||
|
||||
@ -102,8 +102,8 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
|
||||
VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
|
||||
VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
|
||||
VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (IndexType i = 0; i < 2; ++i) {
|
||||
for (IndexType j = 0; j < 3; ++j) {
|
||||
VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
|
||||
VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
|
||||
}
|
||||
@ -112,25 +112,25 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
|
||||
sycl_device.deallocate(gpu_in1_data);
|
||||
sycl_device.deallocate(gpu_in2_data);
|
||||
}
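test_simple_concatenation above checks, axis by axis, that the device result of concatenate matches indexing into the two inputs. A host-only illustration of what concatenate(other, axis) produces; the function name and constants are ours.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

void sketch_concatenate() {
  Eigen::Tensor<float, 2, Eigen::ColMajor, int64_t> a(2, 3), b(2, 3);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  // Concatenating along axis 0 stacks the tensors: the result is 4 x 3.
  Eigen::Tensor<float, 2, Eigen::ColMajor, int64_t> c = a.concatenate(b, 0);
  assert(c.dimension(0) == 4 && c.dimension(1) == 3);
  assert(c(0, 0) == 1.0f && c(3, 2) == 2.0f);
}
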
template<typename DataType, int DataLayout, typename Index>
|
||||
template<typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
|
||||
Index leftDim1 = 2;
|
||||
Index leftDim2 = 3;
|
||||
Eigen::array<Index, 2> leftRange = {{leftDim1, leftDim2}};
|
||||
IndexType leftDim1 = 2;
|
||||
IndexType leftDim2 = 3;
|
||||
Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}};
|
||||
|
||||
Index rightDim1 = 2;
|
||||
Index rightDim2 = 3;
|
||||
Eigen::array<Index, 2> rightRange = {{rightDim1, rightDim2}};
|
||||
IndexType rightDim1 = 2;
|
||||
IndexType rightDim2 = 3;
|
||||
Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}};
|
||||
|
||||
Index concatDim1 = 4;
|
||||
Index concatDim2 = 3;
|
||||
Eigen::array<Index, 2> resRange = {{concatDim1, concatDim2}};
|
||||
IndexType concatDim1 = 4;
|
||||
IndexType concatDim2 = 3;
|
||||
Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}};
|
||||
|
||||
Tensor<DataType, 2, DataLayout, Index> left(leftRange);
|
||||
Tensor<DataType, 2, DataLayout, Index> right(rightRange);
|
||||
Tensor<DataType, 2, DataLayout, Index> result(resRange);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> left(leftRange);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> right(rightRange);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> result(resRange);
|
||||
|
||||
left.setRandom();
|
||||
right.setRandom();
|
||||
@ -141,9 +141,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
|
||||
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
|
||||
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_out(gpu_out_data, resRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
|
||||
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
|
||||
@ -154,8 +154,8 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
|
||||
sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
|
||||
sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (IndexType i = 0; i < 2; ++i) {
|
||||
for (IndexType j = 0; j < 3; ++j) {
|
||||
VERIFY_IS_EQUAL(left(i, j), result(i, j));
|
||||
VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
|
||||
}
|
||||
@ -169,9 +169,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_concatenation<DataType, RowMajor, int>(sycl_device);
test_simple_concatenation<DataType, ColMajor, int>(sycl_device);
test_concatenation_as_lvalue<DataType, ColMajor, int>(sycl_device);
test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device);
test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_concatenation_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include <iostream>
@ -28,56 +28,56 @@ using Eigen::array;
|
||||
using Eigen::SyclDevice;
|
||||
using Eigen::Tensor;
|
||||
using Eigen::TensorMap;
|
||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||
template<int DataLayout, typename Device>
|
||||
void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size)
|
||||
template<int DataLayout, typename DataType, typename IndexType, typename Device>
|
||||
void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
|
||||
{
|
||||
typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
|
||||
static const DataType error_threshold =1e-4f;
|
||||
// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// with these dimensions, the output has 300 * 140 elements, which is
|
||||
// more than 30 * 1024, which is the number of threads in blocks on
|
||||
// a 15 SM GK110 GPU
|
||||
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
|
||||
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
|
||||
Tensor<float, 2, DataLayout> t_result(m_size, n_size);
|
||||
Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
|
||||
// Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
|
||||
Eigen::array<int, 2> left_dims = {{m_size, k_size}};
|
||||
Eigen::array<int, 2> right_dims = {{k_size, n_size}};
|
||||
Eigen::array<int, 2> result_dims = {{m_size, n_size}};
|
||||
Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
|
||||
Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
|
||||
Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
|
||||
|
||||
t_left.setRandom();
|
||||
t_right.setRandom();
|
||||
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(float);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(float);
|
||||
std::size_t t_result_bytes = t_result.size() * sizeof(float);
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
|
||||
std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
|
||||
|
||||
float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
|
||||
float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
|
||||
float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
|
||||
DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
|
||||
DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
|
||||
DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_result(d_t_result, result_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, result_dims);
|
||||
|
||||
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
|
||||
|
||||
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
|
||||
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
for (DenseIndex i = 0; i < t_result.size(); i++) {
|
||||
if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) {
|
||||
for (IndexType i = 0; i < t_result.size(); i++) {
|
||||
if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
|
||||
continue;
|
||||
}
|
||||
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
|
||||
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
|
||||
continue;
|
||||
}
|
||||
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
|
||||
std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
|
||||
<< " vs " << t_result_gpu(i) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
@ -86,46 +86,114 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in
|
||||
sycl_device.deallocate(d_t_result);
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout, typename Device>
|
||||
void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
|
||||
template<int DataLayout, typename DataType, typename IndexType, typename Device>
|
||||
void test_TF(const Device& sycl_device)
|
||||
{
|
||||
//std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// with these dimensions, the output has 300 * 140 elements, which is
|
||||
// more than 30 * 1024, which is the number of threads in blocks on
|
||||
// a 15 SM GK110 GPU
|
||||
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
|
||||
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
|
||||
Tensor<float, 0, DataLayout> t_result;
|
||||
Tensor<float, 0, DataLayout> t_result_gpu;
|
||||
Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
|
||||
Eigen::array<int, 2> left_dims = {{m_size, k_size}};
|
||||
Eigen::array<int, 2> right_dims = {{k_size, n_size}};
|
||||
t_left.setRandom();
|
||||
t_right.setRandom();
|
||||
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(float);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(float);
|
||||
std::size_t t_result_bytes = sizeof(float);
|
||||
typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
|
||||
static const DataType error_threshold =1e-4f;
|
||||
Eigen::array<IndexType, 2> left_dims = {{2, 3}};
|
||||
Eigen::array<IndexType, 2> right_dims = {{3, 1}};
|
||||
Eigen::array<IndexType, 2> res_dims = {{2, 1}};
|
||||
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
|
||||
|
||||
|
||||
float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
|
||||
float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
|
||||
float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > gpu_t_result(d_t_result);
|
||||
t_left.data()[0] = 1.0f;
|
||||
t_left.data()[1] = 2.0f;
|
||||
t_left.data()[2] = 3.0f;
|
||||
t_left.data()[3] = 4.0f;
|
||||
t_left.data()[4] = 5.0f;
|
||||
t_left.data()[5] = 6.0f;
|
||||
|
||||
t_right.data()[0] = -1.0f;
|
||||
t_right.data()[1] = 0.5f;
|
||||
t_right.data()[2] = 2.0f;
|
||||
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
|
||||
std::size_t t_result_bytes = t_result.size()*sizeof(DataType);
|
||||
|
||||
|
||||
DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
|
||||
DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
|
||||
DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, res_dims);
|
||||
|
||||
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
|
||||
|
||||
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
|
||||
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
|
||||
if (static_cast<float>(fabs(t_result() - t_result_gpu())) > 1e-4f &&
|
||||
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
|
||||
for (IndexType i = 0; i < t_result.size(); i++) {
|
||||
if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
|
||||
continue;
|
||||
}
|
||||
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
|
||||
continue;
|
||||
}
|
||||
std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
|
||||
<< " vs " << t_result_gpu(i) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
sycl_device.deallocate(d_t_left);
|
||||
sycl_device.deallocate(d_t_right);
|
||||
sycl_device.deallocate(d_t_result);
|
||||
|
||||
|
||||
}
|
||||
|
||||
template<int DataLayout, typename DataType, typename IndexType, typename Device>
|
||||
void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
|
||||
{
|
||||
//std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// with these dimensions, the output has 300 * 140 elements, which is
|
||||
// more than 30 * 1024, which is the number of threads in blocks on
|
||||
// a 15 SM GK110 GPU
|
||||
typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
|
||||
static const DataType error_threshold =1e-4f;
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
|
||||
Tensor<DataType, 0, DataLayout, IndexType> t_result;
|
||||
Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
|
||||
Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
|
||||
Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
|
||||
Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
|
||||
t_left.setRandom();
|
||||
t_right.setRandom();
|
||||
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
|
||||
std::size_t t_result_bytes = sizeof(DataType);
|
||||
|
||||
|
||||
DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
|
||||
DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
|
||||
DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType> > gpu_t_result(d_t_result);
|
||||
|
||||
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
|
||||
|
||||
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
|
||||
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
if (static_cast<DataType>(fabs(t_result() - t_result_gpu())) > error_threshold &&
|
||||
!Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
|
||||
std::cout << "mismatch detected: " << t_result()
|
||||
<< " vs " << t_result_gpu() << std::endl;
|
||||
assert(false);
|
||||
@ -137,47 +205,47 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout, typename Device>
|
||||
template<int DataLayout, typename DataType, typename IndexType, typename Device>
|
||||
void test_sycl_contraction_m(const Device& sycl_device) {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_sycl_contraction<DataLayout>(sycl_device, k, 128, 128);
|
||||
for (IndexType k = 32; k < 256; k++) {
|
||||
test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout, typename Device>
|
||||
template<int DataLayout, typename DataType, typename IndexType, typename Device>
|
||||
void test_sycl_contraction_k(const Device& sycl_device) {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_sycl_contraction<DataLayout>(sycl_device, 128, k, 128);
|
||||
for (IndexType k = 32; k < 256; k++) {
|
||||
test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout, typename Device>
|
||||
template<int DataLayout, typename DataType, typename IndexType, typename Device>
|
||||
void test_sycl_contraction_n(const Device& sycl_device) {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_sycl_contraction<DataLayout>(sycl_device, 128, 128, k);
|
||||
for (IndexType k = 32; k < 256; k++) {
|
||||
test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 128, k);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout, typename Device>
|
||||
template<int DataLayout, typename DataType, typename IndexType, typename Device>
|
||||
void test_sycl_contraction_sizes(const Device& sycl_device) {
|
||||
int m_sizes[] = { 31, 39, 63, 64, 65,
|
||||
IndexType m_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257 , 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
|
||||
int n_sizes[] = { 31, 39, 63, 64, 65,
|
||||
IndexType n_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257, 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
|
||||
int k_sizes[] = { 31, 39, 63, 64, 65,
|
||||
IndexType k_sizes[] = { 31, 39, 63, 64, 65,
|
||||
95, 96, 127, 129, 255,
|
||||
257, 511, 512, 513, 1023,
|
||||
1024, 1025};
|
||||
|
||||
for (int i = 0; i < 15; i++) {
|
||||
for (int j = 0; j < 15; j++) {
|
||||
for (int k = 0; k < 17; k++) {
|
||||
test_sycl_contraction<DataLayout>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
|
||||
for (IndexType i = 0; i < 15; i++) {
|
||||
for (IndexType j = 0; j < 15; j++) {
|
||||
for (IndexType k = 0; k < 17; k++) {
|
||||
test_sycl_contraction<DataLayout, DataType,IndexType>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -186,24 +254,27 @@ void test_sycl_contraction_sizes(const Device& sycl_device) {
template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){
QueueInterface queueInterface(s);
auto sycl_device=Eigen::SyclDevice(&queueInterface);
test_sycl_contraction<ColMajor>(sycl_device, 32, 32, 32);
test_sycl_contraction<RowMajor>(sycl_device, 32, 32, 32);
test_scalar<ColMajor>(sycl_device, 32, 32, 32);
test_scalar<RowMajor>(sycl_device, 32, 32, 32);
test_sycl_contraction<ColMajor, float,int64_t>(sycl_device, 32, 32, 32);
test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 32, 32, 32);
test_scalar<ColMajor,float,int64_t>(sycl_device, 32, 32, 32);
test_scalar<RowMajor,float,int64_t>(sycl_device, 32, 32, 32);
std::chrono::time_point<std::chrono::system_clock> start, end;
start = std::chrono::system_clock::now();
test_sycl_contraction<ColMajor>(sycl_device, 128, 128, 128);
test_sycl_contraction<RowMajor>(sycl_device, 128, 128, 128);
test_scalar<ColMajor>(sycl_device, 128, 128, 128);
test_scalar<RowMajor>(sycl_device, 128, 128, 128);
test_sycl_contraction_m<ColMajor>(sycl_device);
test_sycl_contraction_m<RowMajor>(sycl_device);
test_sycl_contraction_n<ColMajor>(sycl_device);
test_sycl_contraction_n<RowMajor>(sycl_device);
test_sycl_contraction_k<ColMajor>(sycl_device);
test_sycl_contraction_k<RowMajor>(sycl_device);
test_sycl_contraction_sizes<ColMajor>(sycl_device);
test_sycl_contraction_sizes<RowMajor>(sycl_device);
test_sycl_contraction<ColMajor,float,int64_t>(sycl_device, 128, 128, 128);
test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 128, 128, 128);
test_scalar<ColMajor,float,int64_t>(sycl_device, 128, 128, 128);
test_scalar<RowMajor,float,int64_t>(sycl_device, 128, 128, 128);
test_sycl_contraction_m<ColMajor, float, int64_t>(sycl_device);
test_sycl_contraction_m<RowMajor, float, int64_t>(sycl_device);
test_sycl_contraction_n<ColMajor, float, int64_t>(sycl_device);
test_sycl_contraction_n<RowMajor, float, int64_t>(sycl_device);
test_sycl_contraction_k<ColMajor, float, int64_t>(sycl_device);
test_sycl_contraction_k<RowMajor, float, int64_t>(sycl_device);
test_sycl_contraction_sizes<ColMajor, float, int64_t>(sycl_device);
test_sycl_contraction_sizes<RowMajor, float, int64_t>(sycl_device);
test_TF<RowMajor, float, int64_t>(sycl_device);
test_TF<ColMajor, float, int64_t>(sycl_device);

end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::time_t end_time = std::chrono::system_clock::to_time_t(end);
@ -211,6 +282,7 @@ template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s
<< "elapsed time: " << elapsed_seconds.count() << "s\n";

}

void test_cxx11_tensor_contract_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorContractionPerDevice(device));
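Taken together, the hunks above template the SYCL contraction test on DataType and IndexType (and switch the dense index type to int64_t) instead of hard-coding float and int. For reference, a minimal sketch of the per-device driver pattern these tests follow, condensed from the tensorContractionPerDevice function above; the helper name runContractionTests is illustrative, and the sketch assumes the includes, defines and test functions of the file shown in this diff:

  template <typename DevSelector>
  void runContractionTests(DevSelector& s) {
    // Wrap the selected SYCL device in an Eigen queue/device pair.
    QueueInterface queueInterface(s);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);
    // Layout, scalar type and index type are now explicit template arguments.
    test_sycl_contraction<ColMajor, float, int64_t>(sycl_device, 32, 32, 32);
    test_sycl_contraction<RowMajor, float, int64_t>(sycl_device, 32, 32, 32);
    test_scalar<ColMajor, float, int64_t>(sycl_device, 32, 32, 32);
    test_scalar<RowMajor, float, int64_t>(sycl_device, 32, 32, 32);
  }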
469 unsupported/test/cxx11_tensor_convolution_sycl.cpp Normal file
@ -0,0 +1,469 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
|
||||
#define EIGEN_USE_SYCL
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <ctime>
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include <iomanip>
|
||||
|
||||
using Eigen::array;
|
||||
using Eigen::SyclDevice;
|
||||
using Eigen::Tensor;
|
||||
using Eigen::TensorMap;
|
||||
static const float error_threshold =1e-4f;
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
IndexType indim0 =53;
|
||||
IndexType indim1= 55;
|
||||
IndexType indim2= 51;
|
||||
IndexType outdim0=50;
|
||||
IndexType outdim1=55;
|
||||
IndexType outdim2=51;
|
||||
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
|
||||
Eigen::array<IndexType, 1> kernel_dims = {{4}};
|
||||
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
|
||||
|
||||
Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
|
||||
|
||||
Eigen::array<IndexType, 1> dims3{{0}};
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
result.setZero();
|
||||
result_host.setZero();
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
result_host=input.convolve(kernel, dims3);
|
||||
|
||||
for(IndexType i=0; i< outdim0; i++ ){
|
||||
for(IndexType j=0; j< outdim1; j++ ){
|
||||
for(IndexType k=0; k< outdim2; k++ ){
|
||||
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
|
||||
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_result);
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
IndexType indim0 =53;
|
||||
IndexType indim1= 55;
|
||||
IndexType indim2= 51;
|
||||
IndexType outdim0=50;
|
||||
IndexType outdim1=51;
|
||||
IndexType outdim2=51;
|
||||
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
|
||||
Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
|
||||
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
|
||||
|
||||
Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
|
||||
|
||||
Eigen::array<IndexType, 2> dims3{{0,1}};
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
result.setZero();
|
||||
result_host.setZero();
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
result_host=input.convolve(kernel, dims3);
|
||||
|
||||
for(IndexType i=0; i< outdim0; i++ ){
|
||||
for(IndexType j=0; j< outdim1; j++ ){
|
||||
for(IndexType k=0; k< outdim2; k++ ){
|
||||
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
|
||||
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_result);
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
IndexType indim0 =53;
|
||||
IndexType indim1= 55;
|
||||
IndexType indim2= 51;
|
||||
IndexType outdim0=50;
|
||||
IndexType outdim1=51;
|
||||
IndexType outdim2=49;
|
||||
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
|
||||
Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
|
||||
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
|
||||
|
||||
Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
|
||||
|
||||
Eigen::array<IndexType, 3> dims3{{0,1,2}};
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
result.setZero();
|
||||
result_host.setZero();
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
result_host=input.convolve(kernel, dims3);
|
||||
|
||||
for(IndexType i=0; i< outdim0; i++ ){
|
||||
for(IndexType j=0; j< outdim1; j++ ){
|
||||
for(IndexType k=0; k< outdim2; k++ ){
|
||||
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
|
||||
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_result);
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_evals(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
Eigen::array<IndexType, 2> input_dims = {{3, 3}};
|
||||
Eigen::array<IndexType, 1> kernel_dims = {{2}};
|
||||
Eigen::array<IndexType, 2> result_dims = {{2, 3}};
|
||||
|
||||
Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
|
||||
|
||||
Eigen::array<IndexType, 1> dims3{{0}};
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
result.setZero();
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0
|
||||
VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2
|
||||
VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4
|
||||
VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1
|
||||
VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3
|
||||
VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5
|
||||
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_result);
|
||||
}
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_expr(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
Eigen::array<IndexType, 2> input_dims = {{3, 3}};
|
||||
Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
|
||||
Eigen::array<IndexType, 2> result_dims = {{2, 2}};
|
||||
|
||||
Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
Eigen::array<IndexType, 2> dims;
|
||||
dims[0] = 0;
|
||||
dims[1] = 1;
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
|
||||
input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
|
||||
VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
|
||||
input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
|
||||
VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
|
||||
input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
|
||||
VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
|
||||
input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
|
||||
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_result);
|
||||
}
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_modes(const Eigen::SyclDevice& sycl_device){
|
||||
|
||||
Eigen::array<IndexType, 1> input_dims = {{3}};
|
||||
Eigen::array<IndexType, 1> kernel_dims = {{3}};
|
||||
|
||||
Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
Eigen::array<IndexType, 1> dims;
|
||||
dims[0] = 0;
|
||||
|
||||
input(0) = 1.0f;
|
||||
input(1) = 2.0f;
|
||||
input(2) = 3.0f;
|
||||
kernel(0) = 0.5f;
|
||||
kernel(1) = 1.0f;
|
||||
kernel(2) = 0.0f;
|
||||
|
||||
Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
|
||||
|
||||
// Emulate VALID mode (as defined in
|
||||
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
|
||||
padding[0] = std::make_pair(0, 0);
|
||||
Tensor<DataType, 1, DataLayout, IndexType> valid(1);
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t valid_bytes = valid.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
|
||||
sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
|
||||
|
||||
VERIFY_IS_EQUAL(valid.dimension(0), 1);
|
||||
VERIFY_IS_APPROX(valid(0), 2.5f);
|
||||
|
||||
// Emulate SAME mode (as defined in
|
||||
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
|
||||
padding[0] = std::make_pair(1, 1);
|
||||
Tensor<DataType, 1, DataLayout, IndexType> same(3);
|
||||
std::size_t same_bytes = same.size() * sizeof(DataType);
|
||||
DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes));
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
|
||||
gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
|
||||
sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
|
||||
|
||||
VERIFY_IS_EQUAL(same.dimension(0), 3);
|
||||
VERIFY_IS_APPROX(same(0), 1.0f);
|
||||
VERIFY_IS_APPROX(same(1), 2.5f);
|
||||
VERIFY_IS_APPROX(same(2), 4.0f);
|
||||
|
||||
// Emulate FULL mode (as defined in
|
||||
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
|
||||
padding[0] = std::make_pair(2, 2);
|
||||
|
||||
Tensor<DataType, 1, DataLayout, IndexType> full(5);
|
||||
std::size_t full_bytes = full.size() * sizeof(DataType);
|
||||
DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes));
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
|
||||
gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
|
||||
sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
|
||||
|
||||
VERIFY_IS_EQUAL(full.dimension(0), 5);
|
||||
VERIFY_IS_APPROX(full(0), 0.0f);
|
||||
VERIFY_IS_APPROX(full(1), 1.0f);
|
||||
VERIFY_IS_APPROX(full(2), 2.5f);
|
||||
VERIFY_IS_APPROX(full(3), 4.0f);
|
||||
VERIFY_IS_APPROX(full(4), 1.5f);
|
||||
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_valid);
|
||||
sycl_device.deallocate(d_same);
|
||||
sycl_device.deallocate(d_full);
|
||||
|
||||
}
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_strides(const Eigen::SyclDevice& sycl_device){
|
||||
|
||||
Eigen::array<IndexType, 1> input_dims = {{13}};
|
||||
Eigen::array<IndexType, 1> kernel_dims = {{3}};
|
||||
|
||||
Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 1, DataLayout, IndexType> result(2);
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
Eigen::array<IndexType, 1> dims;
|
||||
dims[0] = 0;
|
||||
|
||||
Eigen::array<IndexType, 1> stride_of_3;
|
||||
stride_of_3[0] = 3;
|
||||
Eigen::array<IndexType, 1> stride_of_2;
|
||||
stride_of_2[0] = 2;
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
VERIFY_IS_EQUAL(result.dimension(0), 2);
|
||||
VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
|
||||
input(6)*kernel(2)));
|
||||
VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
|
||||
input(12)*kernel(2)));
|
||||
}
|
||||
|
||||
template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
|
||||
QueueInterface queueInterface(s);
|
||||
auto sycl_device=Eigen::SyclDevice(&queueInterface);
|
||||
test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
|
||||
test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
|
||||
test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
|
||||
test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
|
||||
test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
|
||||
test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
|
||||
test_evals<float, ColMajor, int64_t>(sycl_device);
|
||||
test_evals<float, RowMajor, int64_t>(sycl_device);
|
||||
test_expr<float, ColMajor, int64_t>(sycl_device);
|
||||
test_expr<float, RowMajor, int64_t>(sycl_device);
|
||||
test_modes<float, ColMajor, int64_t>(sycl_device);
|
||||
test_modes<float, RowMajor, int64_t>(sycl_device);
|
||||
test_strides<float, ColMajor, int64_t>(sycl_device);
|
||||
test_strides<float, RowMajor, int64_t>(sycl_device);
|
||||
}
|
||||
|
||||
void test_cxx11_tensor_convolution_sycl() {
|
||||
for (const auto& device :Eigen::get_sycl_supported_devices()) {
|
||||
CALL_SUBTEST(tensorConvolutionPerDevice(device));
|
||||
}
|
||||
}
|
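In the new convolution test above, test_modes emulates numpy.convolve's VALID, SAME and FULL modes purely by padding the input before a single convolve call. A small sketch of the mapping for the length-3 kernel used there, with the pad amounts and expected output lengths taken from the assertions in the file (gpu_input, gpu_kernel, padding and dims are the names from that code; gpu_valid, gpu_same and gpu_full are its three output maps):

  // Input length 3, kernel length 3 (0.5f, 1.0f, 0.0f), convolution along dim 0.
  Eigen::array<std::pair<int64_t, int64_t>, 1> padding;
  padding[0] = std::make_pair(0, 0);  // VALID: no padding       -> output length 1
  padding[0] = std::make_pair(1, 1);  // SAME : pad 1 each side  -> output length 3
  padding[0] = std::make_pair(2, 2);  // FULL : pad 2 each side  -> output length 5
  // Each variant is then evaluated on the device as, e.g.:
  //   gpu_valid.device(sycl_device) = gpu_input.pad(padding).convolve(gpu_kernel, dims);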
@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
@ -22,35 +22,35 @@
#include <stdint.h>
#include <iostream>

template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
std::cout << "Running on : "
<< sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
<<std::endl;
int sizeDim1 = 100;
array<int, 1> tensorRange = {{sizeDim1}};
Tensor<DataType, 1, DataLayout> in(tensorRange);
Tensor<DataType, 1, DataLayout> in1(tensorRange);
IndexType sizeDim1 = 100;
array<IndexType, 1> tensorRange = {{sizeDim1}};
Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
memset(in1.data(), 1, in1.size() * sizeof(DataType));
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
for (int i=0; i<in.size(); i++) {
for (IndexType i=0; i<in.size(); i++) {
VERIFY_IS_EQUAL(in(i), in1(i));
}
sycl_device.deallocate(gpu_in_data);
}

template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
VERIFY(sycl_device.ok());
int sizeDim1 = 100;
array<int, 1> tensorDims = {{sizeDim1}};
IndexType sizeDim1 = 100;
array<IndexType, 1> tensorDims = {{sizeDim1}};
DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));

TensorMap<Tensor<DataType, 1, DataLayout>> in(gpu_data, tensorDims);
TensorMap<Tensor<DataType, 1, DataLayout>> out(gpu_data, tensorDims);
TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
out.device(sycl_device) = in / in.constant(0);

sycl_device.synchronize();
@ -62,8 +62,8 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_device_memory<DataType, RowMajor>(sycl_device);
test_device_memory<DataType, ColMajor>(sycl_device);
test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
/// this test throw an exception. enable it if you want to see the exception
//test_device_exceptions<DataType, RowMajor>(sycl_device);
/// this test throw an exception. enable it if you want to see the exception
@ -14,23 +14,23 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {

int sizeDim1 = 100;
int sizeDim2 = 20;
int sizeDim3 = 20;
Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Eigen::Tensor<DataType, 3, DataLayout> in1(tensorRange);
Eigen::Tensor<DataType, 3, DataLayout> in2(tensorRange);
Eigen::Tensor<DataType, 3, DataLayout> out(tensorRange);
IndexType sizeDim1 = 100;
IndexType sizeDim2 = 20;
IndexType sizeDim3 = 20;
Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);

DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
@ -40,17 +40,17 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
in2 = in2.random() + in2.constant(10.0f);

// creating TensorMap from tensor
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
/// c=(a+b)*b
gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k),
(in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
}
@ -66,8 +66,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_forced_eval_sycl<DataType, RowMajor>(sycl_device);
test_forced_eval_sycl<DataType, ColMajor>(sycl_device);
test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_forced_eval_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
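The forced-eval hunks above keep the same computation, c = (a + b) * b, and only add the IndexType template parameter. The point of the test is the .eval() call: it forces the sum (gpu_in1 + gpu_in2) to be evaluated into an intermediate result on the device before the multiplication, rather than leaving the whole right-hand side as one fused expression. In sketch form, reusing the TensorMap names declared in those hunks:

  // Force evaluation of the sum first, then multiply element-wise by gpu_in2.
  gpu_out.device(sycl_device) = (gpu_in1 + gpu_in2).eval() * gpu_in2;
  // Copy the device result back and compare against the host-side expression
  // (in1 + in2) * in2, as the verification loop above does.
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,
                                 (out.dimensions().TotalSize()) * sizeof(DataType));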
@ -16,7 +16,7 @@
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
|
||||
#define EIGEN_USE_SYCL
|
||||
|
||||
|
||||
@ -28,18 +28,18 @@ using Eigen::SyclDevice;
|
||||
using Eigen::Tensor;
|
||||
using Eigen::TensorMap;
|
||||
|
||||
template <typename DataType, int DataLayout>
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
typename Tensor<DataType, 5 ,DataLayout>::Dimensions dim1(2,3,1,7,1);
|
||||
typename Tensor<DataType, 3 ,DataLayout>::Dimensions dim2(2,3,7);
|
||||
typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim3(6,7);
|
||||
typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim4(2,21);
|
||||
typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1);
|
||||
typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7);
|
||||
typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7);
|
||||
typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21);
|
||||
|
||||
Tensor<DataType, 5, DataLayout> tensor1(dim1);
|
||||
Tensor<DataType, 3, DataLayout> tensor2(dim2);
|
||||
Tensor<DataType, 2, DataLayout> tensor3(dim3);
|
||||
Tensor<DataType, 2, DataLayout> tensor4(dim4);
|
||||
Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1);
|
||||
Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4);
|
||||
|
||||
tensor1.setRandom();
|
||||
|
||||
@ -48,10 +48,10 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
|
||||
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
|
||||
DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
|
||||
|
||||
TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, dim1);
|
||||
TensorMap<Tensor<DataType, 3,DataLayout>> gpu2(gpu_data2, dim2);
|
||||
TensorMap<Tensor<DataType, 2,DataLayout>> gpu3(gpu_data3, dim3);
|
||||
TensorMap<Tensor<DataType, 2,DataLayout>> gpu4(gpu_data4, dim4);
|
||||
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1);
|
||||
TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2);
|
||||
TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3);
|
||||
TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
|
||||
|
||||
@ -63,9 +63,9 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
|
||||
|
||||
gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
|
||||
sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType));
|
||||
for (int i = 0; i < 2; ++i){
|
||||
for (int j = 0; j < 3; ++j){
|
||||
for (int k = 0; k < 7; ++k){
|
||||
for (IndexType i = 0; i < 2; ++i){
|
||||
for (IndexType j = 0; j < 3; ++j){
|
||||
for (IndexType k = 0; k < 7; ++k){
|
||||
VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor
|
||||
if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
|
||||
VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor
|
||||
@ -86,15 +86,15 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
|
||||
}
|
||||
|
||||
|
||||
template<typename DataType, int DataLayout>
|
||||
template<typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
typename Tensor<DataType, 3, DataLayout>::Dimensions dim1(2,3,7);
|
||||
typename Tensor<DataType, 2, DataLayout>::Dimensions dim2(6,7);
|
||||
typename Tensor<DataType, 5, DataLayout>::Dimensions dim3(2,3,1,7,1);
|
||||
Tensor<DataType, 3, DataLayout> tensor(dim1);
|
||||
Tensor<DataType, 2, DataLayout> tensor2d(dim2);
|
||||
Tensor<DataType, 5, DataLayout> tensor5d(dim3);
|
||||
typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7);
|
||||
typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7);
|
||||
typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1);
|
||||
Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1);
|
||||
Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2);
|
||||
Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3);
|
||||
|
||||
tensor.setRandom();
|
||||
|
||||
@ -102,9 +102,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
|
||||
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
|
||||
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
|
||||
|
||||
TensorMap< Tensor<DataType, 3, DataLayout> > gpu1(gpu_data1, dim1);
|
||||
TensorMap< Tensor<DataType, 2, DataLayout> > gpu2(gpu_data2, dim2);
|
||||
TensorMap< Tensor<DataType, 5, DataLayout> > gpu3(gpu_data3, dim3);
|
||||
TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1);
|
||||
TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2);
|
||||
TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
|
||||
|
||||
@ -115,9 +115,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
|
||||
sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType));
|
||||
|
||||
|
||||
for (int i = 0; i < 2; ++i){
|
||||
for (int j = 0; j < 3; ++j){
|
||||
for (int k = 0; k < 7; ++k){
|
||||
for (IndexType i = 0; i < 2; ++i){
|
||||
for (IndexType j = 0; j < 3; ++j){
|
||||
for (IndexType k = 0; k < 7; ++k){
|
||||
VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
|
||||
if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
|
||||
VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor
|
||||
@ -134,43 +134,43 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
|
||||
}
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout>
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
|
||||
{
|
||||
int sizeDim1 = 2;
|
||||
int sizeDim2 = 3;
|
||||
int sizeDim3 = 5;
|
||||
int sizeDim4 = 7;
|
||||
int sizeDim5 = 11;
|
||||
array<int, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 5,DataLayout> tensor(tensorRange);
|
||||
IndexType sizeDim1 = 2;
|
||||
IndexType sizeDim2 = 3;
|
||||
IndexType sizeDim3 = 5;
|
||||
IndexType sizeDim4 = 7;
|
||||
IndexType sizeDim5 = 11;
|
||||
array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
|
||||
Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange);
|
||||
tensor.setRandom();
|
||||
array<int, 5> slice1_range ={{1, 1, 1, 1, 1}};
|
||||
Tensor<DataType, 5,DataLayout> slice1(slice1_range);
|
||||
array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
|
||||
Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
|
||||
|
||||
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
|
||||
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
|
||||
TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, tensorRange);
|
||||
TensorMap<Tensor<DataType, 5,DataLayout>> gpu2(gpu_data2, slice1_range);
|
||||
Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
|
||||
Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
|
||||
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
|
||||
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
|
||||
Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
|
||||
Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
|
||||
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
|
||||
gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
|
||||
sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
|
||||
VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
|
||||
|
||||
|
||||
array<int, 5> slice2_range ={{1,1,2,2,3}};
|
||||
Tensor<DataType, 5,DataLayout> slice2(slice2_range);
|
||||
array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
|
||||
Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
|
||||
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
|
||||
TensorMap<Tensor<DataType, 5,DataLayout>> gpu3(gpu_data3, slice2_range);
|
||||
Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
|
||||
Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
|
||||
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
|
||||
Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
|
||||
Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
|
||||
gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
|
||||
sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
for (IndexType i = 0; i < 2; ++i) {
|
||||
for (IndexType j = 0; j < 2; ++j) {
|
||||
for (IndexType k = 0; k < 3; ++k) {
|
||||
VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
|
||||
}
|
||||
}
|
||||
@ -219,7 +219,8 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));

for(int i=0;i<sizeDim1;i++) for(int j=0;j<sizeDim2;j++){
for(IndexType i=0;i<sizeDim1;i++)
for(IndexType j=0;j<sizeDim2;j++){
VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
}
sycl_device.deallocate(gpu_data1);
@ -230,12 +231,12 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_slice<DataType, RowMajor>(sycl_device);
test_simple_slice<DataType, ColMajor>(sycl_device);
test_simple_reshape<DataType, RowMajor>(sycl_device);
test_simple_reshape<DataType, ColMajor>(sycl_device);
test_reshape_as_lvalue<DataType, RowMajor>(sycl_device);
test_reshape_as_lvalue<DataType, ColMajor>(sycl_device);
test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
}
@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

@ -69,10 +69,10 @@ static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
gpu2.device(sycl_device)=gpu1.pad(paddings);
sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
for (int i = 0; i < padedSizeDim1; ++i) {
for (int j = 0; j < padedSizeDim2; ++j) {
for (int k = 0; k < padedSizeDim3; ++k) {
for (int l = 0; l < padedSizeDim4; ++l) {
for (IndexType i = 0; i < padedSizeDim1; ++i) {
for (IndexType j = 0; j < padedSizeDim2; ++j) {
for (IndexType k = 0; k < padedSizeDim3; ++k) {
for (IndexType l = 0; l < padedSizeDim4; ++l) {
if (j >= 2 && j < 5 && k >= 3 && k < 8) {
VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
} else {
@ -121,10 +121,10 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));

for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 6; ++j) {
for (int k = 0; k < 12; ++k) {
for (int l = 0; l < 7; ++l) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 6; ++j) {
for (IndexType k = 0; k < 12; ++k) {
for (IndexType l = 0; l < 7; ++l) {
const float result_value = DataLayout == ColMajor ?
result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
if (j >= 2 && j < 5 && k >= 3 && k < 8) {
@ -143,10 +143,6 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_padding<DataType, RowMajor, int>(sycl_device);
test_simple_padding<DataType, ColMajor, int>(sycl_device);
test_padded_expr<DataType, RowMajor, int>(sycl_device);
test_padded_expr<DataType, ColMajor, int>(sycl_device);
test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);
@ -14,97 +14,129 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

template <typename DataType, int DataLayout>
static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) {

const int num_rows = 452;
const int num_cols = 765;
array<int, 2> tensorRange = {{num_rows, num_cols}};
const IndexType num_rows = 452;
const IndexType num_cols = 765;
array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

Tensor<DataType, 2, DataLayout> in(tensorRange);
Tensor<DataType, 0, DataLayout> full_redux;
Tensor<DataType, 0, DataLayout> full_redux_gpu;
Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 0, DataLayout, IndexType> full_redux;
Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

in.setRandom();

full_redux = in.sum();
full_redux = in.mean();

DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));

TensorMap<Tensor<DataType, 2, DataLayout> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 0, DataLayout> > out_gpu(gpu_out_data);
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);

sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum();
out_gpu.device(sycl_device) = in_gpu.mean();
sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
template <typename DataType, int DataLayout>
static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {

int dim_x = 145;
int dim_y = 1;
int dim_z = 67;

array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
Eigen::array<int, 1> red_axis;
red_axis[0] = 0;
array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};
template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) {

Tensor<DataType, 3, DataLayout> in(tensorRange);
Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange);
Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange);
const IndexType num_rows = 876;
const IndexType num_cols = 953;
array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 0, DataLayout, IndexType> full_redux;
Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

in.setRandom();

redux= in.sum(red_axis);
full_redux = in.minimum();

DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));

TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);

sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.minimum();
sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) {

IndexType dim_x = 145;
IndexType dim_y = 1;
IndexType dim_z = 67;

array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
Eigen::array<IndexType, 1> red_axis;
red_axis[0] = 0;
array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

in.setRandom();

redux= in.maximum(red_axis);

DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));

TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);

sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));

// Check that the CPU and GPU reductions return the same result.
for(int j=0; j<reduced_tensorRange[0]; j++ )
for(int k=0; k<reduced_tensorRange[1]; k++ )
for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));

sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout>
static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) {

int dim_x = 567;
int dim_y = 1;
int dim_z = 47;
IndexType dim_x = 567;
IndexType dim_y = 1;
IndexType dim_z = 47;

array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
Eigen::array<int, 1> red_axis;
array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
Eigen::array<IndexType, 1> red_axis;
red_axis[0] = 2;
array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};
array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

Tensor<DataType, 3, DataLayout> in(tensorRange);
Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange);
Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

in.setRandom();

@ -113,15 +145,15 @@ static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device)
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));

TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);

sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
for(int j=0; j<reduced_tensorRange[0]; j++ )
for(int k=0; k<reduced_tensorRange[1]; k++ )
for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));

sycl_device.deallocate(gpu_in_data);
@ -133,12 +165,14 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);

test_full_reductions_sycl<DataType, RowMajor>(sycl_device);
test_first_dim_reductions_sycl<DataType, RowMajor>(sycl_device);
test_last_dim_reductions_sycl<DataType, RowMajor>(sycl_device);
test_full_reductions_sycl<DataType, ColMajor>(sycl_device);
test_first_dim_reductions_sycl<DataType, ColMajor>(sycl_device);
test_last_dim_reductions_sycl<DataType, ColMajor>(sycl_device);
test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_reduction_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
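// --- Illustrative sketch (editor's addition, not part of the diff above) ---
// Every reduction test in this file follows one host-vs-device pattern: compute
// a reference on the CPU, evaluate the same expression through a TensorMap bound
// to device memory, copy the scalar back, and compare. Assuming the test harness
// used above (main.h, EIGEN_USE_SYCL, the unsupported Tensor module); the
// function name below is hypothetical.
template <typename DataType, int DataLayout, typename IndexType>
static void full_mean_reduction_pattern(const Eigen::SyclDevice& sycl_device) {
  array<IndexType, 2> tensorRange = {{452, 765}};
  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;      // host reference
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;  // result copied back from the device
  in.setRandom();
  full_redux = in.mean();                                      // reference, evaluated on the host
  DataType* gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize())*sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean();                 // same expression, evaluated on the SYCL device
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());            // CPU and GPU results must agree
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}
// ---------------------------------------------------------------------------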
221 unsupported/test/cxx11_tensor_reverse_sycl.cpp Normal file
@ -0,0 +1,221 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {

IndexType dim1 = 2;
IndexType dim2 = 3;
IndexType dim3 = 5;
IndexType dim4 = 7;

array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange);
tensor.setRandom();

array<bool, 4> dim_rev;
dim_rev[0] = false;
dim_rev[1] = true;
dim_rev[2] = true;
dim_rev[3] = false;

DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType)));

TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange);

sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
}
}
}
}
dim_rev[0] = true;
dim_rev[1] = false;
dim_rev[2] = false;
dim_rev[3] = false;

out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));

for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
}
}
}
}

dim_rev[0] = true;
dim_rev[1] = false;
dim_rev[2] = false;
dim_rev[3] = true;
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));

for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
}
}
}
}

sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue)
{
IndexType dim1 = 2;
IndexType dim2 = 3;
IndexType dim3 = 5;
IndexType dim4 = 7;

array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange);
Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange);
tensor.setRandom();

array<bool, 4> dim_rev;
dim_rev[0] = false;
dim_rev[1] = true;
dim_rev[2] = false;
dim_rev[3] = true;

DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));

TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange);

sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));

if (LValue) {
out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
} else {
out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
}
sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType));

array<IndexType, 4> src_slice_dim;
src_slice_dim[0] = 2;
src_slice_dim[1] = 3;
src_slice_dim[2] = 1;
src_slice_dim[3] = 7;
array<IndexType, 4> src_slice_start;
src_slice_start[0] = 0;
src_slice_start[1] = 0;
src_slice_start[2] = 0;
src_slice_start[3] = 0;
array<IndexType, 4> dst_slice_dim = src_slice_dim;
array<IndexType, 4> dst_slice_start = src_slice_start;

for (IndexType i = 0; i < 5; ++i) {
if (LValue) {
out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
in_gpu.slice(src_slice_start, src_slice_dim);
} else {
out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
}
src_slice_start[2] += 1;
dst_slice_start[2] += 1;
}
sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));

for (IndexType i = 0; i < expected.dimension(0); ++i) {
for (IndexType j = 0; j < expected.dimension(1); ++j) {
for (IndexType k = 0; k < expected.dimension(2); ++k) {
for (IndexType l = 0; l < expected.dimension(3); ++l) {
VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
}
}
}
}

dst_slice_start[2] = 0;
result.setRandom();
sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
for (IndexType i = 0; i < 5; ++i) {
if (LValue) {
out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
in_gpu.slice(dst_slice_start, dst_slice_dim);
} else {
out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
}
dst_slice_start[2] += 1;
}
sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));

for (IndexType i = 0; i < expected.dimension(0); ++i) {
for (IndexType j = 0; j < expected.dimension(1); ++j) {
for (IndexType k = 0; k < expected.dimension(2); ++k) {
for (IndexType l = 0; l < expected.dimension(3); ++l) {
VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
}
}
}
}
}

template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device);
test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false);
test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false);
test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
}
void test_cxx11_tensor_reverse_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_reverse_test_per_device<float>(device));
}
}
@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

@ -28,20 +28,20 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;

template <typename DataType, int DataLayout, typename IndexTypes>
template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
{
IndexTypes sizeDim1 = 2;
IndexTypes sizeDim2 = 3;
IndexTypes sizeDim3 = 5;
IndexTypes sizeDim4 = 7;
array<IndexTypes, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
Tensor<DataType, 4, DataLayout,IndexTypes> tensor(tensorRange);
Tensor<DataType, 4, DataLayout,IndexTypes> no_shuffle(tensorRange);
IndexType sizeDim1 = 2;
IndexType sizeDim2 = 3;
IndexType sizeDim3 = 5;
IndexType sizeDim4 = 7;
array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
Tensor<DataType, 4, DataLayout,IndexType> no_shuffle(tensorRange);
tensor.setRandom();

const size_t buffSize =tensor.size()*sizeof(DataType);
array<IndexTypes, 4> shuffles;
array<IndexType, 4> shuffles;
shuffles[0] = 0;
shuffles[1] = 1;
shuffles[2] = 2;
@ -50,8 +50,8 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));

TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu2(gpu_data2, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);

sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);

@ -64,10 +64,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3);
VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4);

for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (int l = 0; l < sizeDim4; ++l) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
for (IndexType l = 0; l < sizeDim4; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
}
}
@ -78,10 +78,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
shuffles[1] = 3;
shuffles[2] = 1;
shuffles[3] = 0;
array<IndexTypes, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
Tensor<DataType, 4, DataLayout,IndexTypes> shuffle(tensorrangeShuffle);
array<IndexType, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
Tensor<DataType, 4, DataLayout,IndexType> shuffle(tensorrangeShuffle);
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
TensorMap<Tensor<DataType, 4,DataLayout,IndexTypes>> gpu3(gpu_data3, tensorrangeShuffle);
TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu3(gpu_data3, tensorrangeShuffle);

gpu3.device(sycl_device)=gpu1.shuffle(shuffles);
sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
@ -92,10 +92,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2);
VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1);

for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (int l = 0; l < sizeDim4; ++l) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
for (IndexType l = 0; l < sizeDim4; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
}
}
@ -107,9 +107,6 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_shuffling_sycl<DataType, RowMajor, int>(sycl_device);
test_simple_shuffling_sycl<DataType, ColMajor, int>(sycl_device);

test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
203 unsupported/test/cxx11_tensor_striding_sycl.cpp Normal file
@ -0,0 +1,203 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include <iostream>
#include <chrono>
#include <ctime>

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;

template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
{

Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}};

Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims);
Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);

std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
std::size_t stride_bytes = stride.size() * sizeof(DataType);
DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));

Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);

tensor.setRandom();
array<IndexType, 4> strides;
strides[0] = 1;
strides[1] = 1;
strides[2] = 1;
strides[3] = 1;
sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides);
sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);

//no_stride = tensor.stride(strides);

VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
VERIFY_IS_EQUAL(no_stride.dimension(3), 7);

for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
}
}
}
}

strides[0] = 2;
strides[1] = 4;
strides[2] = 2;
strides[3] = 3;
//Tensor<float, 4, DataLayout> stride;
// stride = tensor.stride(strides);

gpu_stride.device(sycl_device)=gpu_tensor.stride(strides);
sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);

VERIFY_IS_EQUAL(stride.dimension(0), 1);
VERIFY_IS_EQUAL(stride.dimension(1), 1);
VERIFY_IS_EQUAL(stride.dimension(2), 3);
VERIFY_IS_EQUAL(stride.dimension(3), 3);

for (IndexType i = 0; i < 1; ++i) {
for (IndexType j = 0; j < 1; ++j) {
for (IndexType k = 0; k < 3; ++k) {
for (IndexType l = 0; l < 3; ++l) {
VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
}
}
}
}

sycl_device.deallocate(d_tensor);
sycl_device.deallocate(d_no_stride);
sycl_device.deallocate(d_stride);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
{

Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}};

Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims);
Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);

std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
std::size_t stride_bytes = stride.size() * sizeof(DataType);

DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));

Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);

//Tensor<float, 4, DataLayout> tensor(2,3,5,7);
tensor.setRandom();
array<IndexType, 4> strides;
strides[0] = 2;
strides[1] = 4;
strides[2] = 2;
strides[3] = 3;

// Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
// result.stride(strides) = tensor;
sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
gpu_stride.stride(strides).device(sycl_device)=gpu_tensor;
sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);

for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
}
}
}
}

array<IndexType, 4> no_strides;
no_strides[0] = 1;
no_strides[1] = 1;
no_strides[2] = 1;
no_strides[3] = 1;
// Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
// result2.stride(strides) = tensor.stride(no_strides);

gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides);
sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);

for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
}
}
}
}
sycl_device.deallocate(d_tensor);
sycl_device.deallocate(d_no_stride);
sycl_device.deallocate(d_stride);
}

template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
QueueInterface queueInterface(s);
auto sycl_device=Eigen::SyclDevice(&queueInterface);
test_simple_striding<float, ColMajor, int64_t>(sycl_device);
test_simple_striding<float, RowMajor, int64_t>(sycl_device);
test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device);
test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
}

void test_cxx11_tensor_striding_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorStridingPerDevice(device));
}
}
@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
@ -27,24 +27,24 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;

template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
int sizeDim1 = 100;
int sizeDim2 = 10;
int sizeDim3 = 20;
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3, DataLayout> in1(tensorRange);
Tensor<DataType, 3, DataLayout> out1(tensorRange);
Tensor<DataType, 3, DataLayout> out2(tensorRange);
Tensor<DataType, 3, DataLayout> out3(tensorRange);
IndexType sizeDim1 = 100;
IndexType sizeDim2 = 10;
IndexType sizeDim3 = 20;
array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);

in1 = in1.random();

DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));

TensorMap<Tensor<DataType, 3, DataLayout>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout>> gpu2(gpu_data2, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);

sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType));
@ -55,7 +55,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType));
sycl_device.synchronize();

for (int i = 0; i < in1.size(); ++i) {
for (IndexType i = 0; i < in1.size(); ++i) {
VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
@ -65,20 +65,20 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_data2);
}

template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
int size = 20;
array<int, 1> tensorRange = {{size}};
Tensor<DataType, 1, DataLayout> in1(tensorRange);
Tensor<DataType, 1, DataLayout> in2(tensorRange);
Tensor<DataType, 1, DataLayout> out(tensorRange);
IndexType size = 20;
array<IndexType, 1> tensorRange = {{size}};
Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);

in1 = in1.random();
in2 = in1;

DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));

TensorMap<Tensor<DataType, 1, DataLayout>> gpu1(gpu_data, tensorRange);
TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType));
sycl_device.synchronize();
in1.setZero();
@ -86,24 +86,24 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
sycl_device.synchronize();

for (int i = 0; i < in1.size(); ++i) {
for (IndexType i = 0; i < in1.size(); ++i) {
VERIFY_IS_APPROX(out(i), in2(i));
}

sycl_device.deallocate(gpu_data);
}

template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {

int sizeDim1 = 100;
int sizeDim2 = 10;
int sizeDim3 = 20;
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3,DataLayout> in1(tensorRange);
Tensor<DataType, 3,DataLayout> in2(tensorRange);
Tensor<DataType, 3,DataLayout> in3(tensorRange);
Tensor<DataType, 3,DataLayout> out(tensorRange);
IndexType sizeDim1 = 100;
IndexType sizeDim2 = 10;
IndexType sizeDim3 = 20;
array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange);
Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);

in2 = in2.random();
in3 = in3.random();
@ -113,19 +113,19 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));

TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in3(gpu_in3_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);

/// a=1.2f
gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType));
sycl_device.synchronize();

for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
}
}
@ -137,9 +137,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType));
sycl_device.synchronize();

for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 1.2f);
}
@ -153,9 +153,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();

for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in2(i,j,k));
@ -168,9 +168,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) +
in2(i,j,k));
@ -183,9 +183,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in1(i,j,k));
@ -198,9 +198,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 3.14f
+ in2(i,j,k) * 2.7f);
@ -214,9 +214,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
? in2(i, j, k)
: in3(i, j, k));
@ -229,15 +229,44 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_in3_data);
sycl_device.deallocate(gpu_out_data);
}
template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType>
static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
IndexType size = 20;
array<IndexType, 1> tensorRange = {{size}};
Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);

in = in.random();

Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));

TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));
gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>();
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
out_host = in. template cast<Scalar2>();
for(IndexType i=0; i< size; i++)
{
VERIFY_IS_APPROX(out(i), out_host(i));
}
printf("cast Test Passed\n");
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_sycl_mem_transfers<DataType, RowMajor>(sycl_device);
test_sycl_computations<DataType, RowMajor>(sycl_device);
test_sycl_mem_sync<DataType, RowMajor>(sycl_device);
test_sycl_mem_transfers<DataType, ColMajor>(sycl_device);
test_sycl_computations<DataType, ColMajor>(sycl_device);
test_sycl_mem_sync<DataType, ColMajor>(sycl_device);
test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
}

void test_cxx11_tensor_sycl() {
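// --- Illustrative sketch (editor's addition, not part of the diff above) ---
// All of the *_sycl tests touched by this commit are dispatched the same way:
// enumerate the devices Eigen can see, wrap each one in a QueueInterface and a
// SyclDevice, and run the templated tests for both layouts with an int64_t
// index type. The driver names below are hypothetical; the calls mirror the
// ones used throughout the diffs above.
template <typename DataType, typename DevSelector>
void run_sycl_tests_on(DevSelector s) {
  QueueInterface queueInterface(s);                     // one SYCL queue per device
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
  test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
}
void run_on_all_supported_devices() {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(run_sycl_tests_on<float>(device));     // same harness macro as the tests above
  }
}
// ---------------------------------------------------------------------------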