From b52312068766eff6c96ff6d2534ab240ccb48a76 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 16 Jan 2023 07:04:08 +0000 Subject: [PATCH] [SYCL-2020 Support] Enabling Intel DPCPP Compiler support to Eigen --- CMakeLists.txt | 15 +- Eigen/Core | 2 + Eigen/src/Core/arch/SYCL/PacketMath.h | 283 ++++-------------- Eigen/src/Core/arch/SYCL/SyclMemoryModel.h | 10 +- Eigen/src/Core/arch/SYCL/TypeCasting.h | 2 +- bench/tensors/README | 6 + bench/tensors/tensor_contract_sycl_bench.cc | 14 +- cmake/FindDPCPP.cmake | 62 ++++ test/main.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 4 +- .../CXX11/src/Tensor/TensorConcatenation.h | 4 +- .../CXX11/src/Tensor/TensorContraction.h | 4 +- .../CXX11/src/Tensor/TensorContractionSycl.h | 20 +- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- .../CXX11/src/Tensor/TensorReductionSycl.h | 22 +- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorScanSycl.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 4 +- unsupported/doc/examples/SYCL/CMakeLists.txt | 7 +- unsupported/test/CMakeLists.txt | 9 +- .../test/cxx11_tensor_builtins_sycl.cpp | 64 +++- unsupported/test/cxx11_tensor_device_sycl.cpp | 7 + 32 files changed, 305 insertions(+), 332 deletions(-) create mode 100644 cmake/FindDPCPP.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index a57caee72..013d53346 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -505,14 +505,19 @@ endif() # add SYCL option(EIGEN_TEST_SYCL "Add Sycl 
support." OFF) -option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF) if(EIGEN_TEST_SYCL) + option(EIGEN_SYCL_DPCPP "Use the DPCPP Sycl implementation (DPCPP is default SYCL-Compiler)." ON) + option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation." OFF) + option(EIGEN_SYCL_ComputeCpp "Use the DPCPP Sycl implementation." OFF) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy-with-user-provided-copy -Wno-unused-variable") set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}") find_package(Threads REQUIRED) if(EIGEN_SYCL_TRISYCL) message(STATUS "Using triSYCL") include(FindTriSYCL) - else() + elseif(EIGEN_SYCL_ComputeCpp) message(STATUS "Using ComputeCPP SYCL") include(FindComputeCpp) set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF) @@ -523,8 +528,12 @@ if(EIGEN_TEST_SYCL) "Use ComputeCpp driver instead of a 2 steps compilation" ${COMPUTECPP_DRIVER_DEFAULT_VALUE} ) + else() #Default SYCL compiler is DPCPP (EIGEN_SYCL_DPCPP) + set(DPCPP_SYCL_TARGET "spir64" CACHE STRING "Defualt target for Intel CPU/GPU") + message(STATUS "Using DPCPP") + find_package(DPCPP) + add_definitions(-DSYCL_COMPILER_IS_DPCPP) endif(EIGEN_SYCL_TRISYCL) - option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." 
OFF) if(EIGEN_DONT_VECTORIZE_SYCL) message(STATUS "Disabling SYCL vectorization in tests/examples") # When disabling SYCL vectorization, also disable Eigen default vectorization diff --git a/Eigen/Core b/Eigen/Core index 623d735d6..34d6c4ef5 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -82,7 +82,9 @@ #include #include #include +#ifndef __SYCL_DEVICE_ONLY__ #include +#endif #include #ifndef EIGEN_NO_IO #include diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h index 5bc323589..062d50efb 100644 --- a/Eigen/src/Core/arch/SYCL/PacketMath.h +++ b/Eigen/src/Core/arch/SYCL/PacketMath.h @@ -21,193 +21,53 @@ #ifndef EIGEN_PACKET_MATH_SYCL_H #define EIGEN_PACKET_MATH_SYCL_H #include + #include "../../InternalHeaderCheck.h" namespace Eigen { namespace internal { #ifdef SYCL_DEVICE_ONLY - -#define SYCL_PLOADT_RO(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - typedef typename unpacket_traits::type scalar; \ - typedef cl::sycl::multi_ptr< \ - scalar, cl::sycl::access::address_space::address_space_target> \ - multi_ptr; \ - auto res = packet_type( \ - static_cast::type>(0)); \ - res.load(0, multi_ptr(const_cast(from))); \ - return res; \ +#define SYCL_PLOAD(packet_type, AlignedType) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \ + pload##AlignedType( \ + const typename unpacket_traits::type* from) { \ + using scalar = typename unpacket_traits::type; \ + typedef cl::sycl::multi_ptr< \ + const scalar, cl::sycl::access::address_space::generic_space, \ + cl::sycl::access::decorated::no> \ + multi_ptr; \ + packet_type res{}; \ + res.load(0, multi_ptr(from)); \ + return res; \ } -SYCL_PLOADT_RO(global_space) -SYCL_PLOADT_RO(local_space) -#undef SYCL_PLOADT_RO -#endif - -template -EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE packet_type -ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, T>& from) { - return ploadt_ro(from.get_pointer()); -} - -#ifdef SYCL_DEVICE_ONLY -#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - return ploadt_ro(from); \ - } - -// global space -SYCL_PLOAD(global_space, Unaligned, u) -SYCL_PLOAD(global_space, Aligned, ) -// local space -SYCL_PLOAD(local_space, Unaligned, u) -SYCL_PLOAD(local_space, Aligned, ) +SYCL_PLOAD(cl::sycl::cl_float4, u) +SYCL_PLOAD(cl::sycl::cl_float4, ) +SYCL_PLOAD(cl::sycl::cl_double2, u) +SYCL_PLOAD(cl::sycl::cl_double2, ) #undef SYCL_PLOAD -#endif -#define SYCL_PLOAD(Alignment, AlignedType) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ - const Eigen::TensorSycl::internal::RangeAccess< \ - cl::sycl::access::mode::read_write, \ - typename unpacket_traits::type> \ - from) { \ - return ploadt_ro(from); \ - } -SYCL_PLOAD(Unaligned, u) -SYCL_PLOAD(Aligned, ) -#undef SYCL_PLOAD - -#ifdef SYCL_DEVICE_ONLY -/** \internal \returns a packet version of \a *from. - * The pointer \a from must be aligned on a \a Alignment bytes boundary. 
*/ -#define SYCL_PLOADT(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - if (Alignment >= unpacket_traits::alignment) \ - return pload(from); \ - else \ - return ploadu(from); \ +#define SYCL_PSTORE(scalar, packet_type, alignment) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ + scalar* to, const packet_type& from) { \ + typedef cl::sycl::multi_ptr< \ + scalar, cl::sycl::access::address_space::generic_space, \ + cl::sycl::access::decorated::no> \ + multi_ptr; \ + from.store(0, multi_ptr(to)); \ } -// global space -SYCL_PLOADT(global_space) -// local space -SYCL_PLOADT(local_space) -#undef SYCL_PLOADT -#endif +SYCL_PSTORE(float, cl::sycl::cl_float4, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, u) -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type -ploadt(const Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, - typename unpacket_traits::type>& from) { - return ploadt(from.get_pointer()); -} -#ifdef SYCL_DEVICE_ONLY - -// private_space -#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \ - ploadt_ro( \ - const typename unpacket_traits::type* from) { \ - typedef typename unpacket_traits::type scalar; \ - auto res = packet_type(static_cast(0)); \ - res.template load( \ - 0, const_cast(from)); \ - return res; \ - } - -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned) - -#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \ - template <> \ - 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \ - const typename unpacket_traits::type* from) { \ - typedef typename unpacket_traits::type scalar; \ - auto res = packet_type(static_cast(0)); \ - res.template load( \ - 0, const_cast(from)); \ - return res; \ - } -SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, ) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, ) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u) - -#undef SYCL_PLOAD_SPECIAL - -#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ - typename cl::sycl::multi_ptr< \ - scalar, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from) { \ - typedef cl::sycl::multi_ptr< \ - scalar, cl::sycl::access::address_space::address_space_target> \ - multi_ptr; \ - from.store(0, multi_ptr(to)); \ - } - -// global space -SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u) -SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u) - -SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u) #undef SYCL_PSTORE -#define SYCL_PSTORE_T(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \ - typename cl::sycl::multi_ptr< \ - scalar, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from) { \ - if (Alignment) \ - 
pstore(to, from); \ - else \ - pstoreu(to, from); \ - } - -SYCL_PSTORE_T(global_space) - -SYCL_PSTORE_T(local_space) - -#undef SYCL_PSTORE_T - #define SYCL_PSET1(packet_type) \ template <> \ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1( \ @@ -291,22 +151,6 @@ struct get_base_packet { } }; -#define SYCL_PLOAD_DUP(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - return get_base_packet::get_ploaddup(from); \ - } - -// global space -SYCL_PLOAD_DUP(global_space) -// local_space -SYCL_PLOAD_DUP(local_space) -#undef SYCL_PLOAD_DUP - #define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \ template <> \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ @@ -325,30 +169,11 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2) const typename unpacket_traits::type& a) { \ return get_base_packet::set_plset(a); \ } - SYCL_PLSET(cl::sycl::cl_float4) SYCL_PLSET(cl::sycl::cl_double2) #undef SYCL_PLSET -#define SYCL_PGATHER(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC inline packet_type pgather( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from, \ - Index stride) { \ - return get_base_packet::get_pgather(from, stride); \ - } - -// global space -SYCL_PGATHER(global_space) -// local space -SYCL_PGATHER(local_space) - -#undef SYCL_PGATHER - #define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \ template <> \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ @@ -362,24 +187,6 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2) #undef SYCL_PGATHER_SPECILIZE -#define SYCL_PSCATTER(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC inline void pscatter( \ - typename cl::sycl::multi_ptr< \ - typename unpacket_traits::type, \ - 
cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from, Index stride) { \ - get_base_packet::set_pscatter(to, from, stride); \ - } - -// global space -SYCL_PSCATTER(global_space) -// local space -SYCL_PSCATTER(local_space) - -#undef SYCL_PSCATTER - #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \ template <> \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( \ @@ -563,6 +370,34 @@ inline cl::sycl::cl_double2 pblend( } #endif // SYCL_DEVICE_ONLY +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type +ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, T>& from) { + return ploadt_ro(from.get_pointer()); +} + +#define SYCL_PLOAD(Alignment, AlignedType) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ + const Eigen::TensorSycl::internal::RangeAccess< \ + cl::sycl::access::mode::read_write, \ + typename unpacket_traits::type> \ + from) { \ + return ploadt_ro(from); \ + } +SYCL_PLOAD(Unaligned, u) +SYCL_PLOAD(Aligned, ) +#undef SYCL_PLOAD + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type +ploadt(const Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, + typename unpacket_traits::type>& from) { + return ploadt(from.get_pointer()); +} + #define SYCL_PSTORE(alignment) \ template \ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ diff --git a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h index 54eedfaa0..c532c1867 100644 --- a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +++ b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h @@ -223,7 +223,6 @@ class PointerMapper { m_pointerMap.clear(); EIGEN_THROW_X( std::out_of_range("The pointer is not registered in the map\n")); - } --node; } @@ -550,7 +549,7 @@ struct RangeAccess { static const auto is_place_holder = cl::sycl::access::placeholder::true_t; typedef T scalar_t; typedef scalar_t &ref_t; - 
typedef typename cl::sycl::global_ptr::pointer_t ptr_t; + typedef scalar_t *ptr_t; // the accessor type does not necessarily the same as T typedef cl::sycl::accessor @@ -570,7 +569,12 @@ struct RangeAccess { RangeAccess(std::nullptr_t) : RangeAccess() {} // This template parameter must be removed and scalar_t should be replaced EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const { - return (access_.get_pointer().get() + offset_); + typedef cl::sycl::multi_ptr + multi_ptr; + multi_ptr p(access_); + return (p + offset_).get_raw(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) { diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h index 613e82331..f6f057b32 100644 --- a/Eigen/src/Core/arch/SYCL/TypeCasting.h +++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h @@ -64,7 +64,7 @@ pcast( cl::sycl::rounding_mode::automatic>(); auto b1 = b.template convert(); - return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y()); + return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y()); } template <> diff --git a/bench/tensors/README b/bench/tensors/README index dcbf0217a..acce5a12b 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -16,5 +16,11 @@ following commands: 1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY} 2. bash eigen_sycl_bench.sh +To compile the floating point GPU benchmarks using Intel DPCPP compiler +/path/to/dpcpp/bin/clang+ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_benchmarks_sycl.cc benchmark_main.cc -lpthread -o eigen_dpcpp_sycl + Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. 
To compile these benchmarks, call g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu + +To compile the contraction with DPCPP: +/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc index c2d098ecc..b1d89d204 100644 --- a/bench/tensors/tensor_contract_sycl_bench.cc +++ b/bench/tensors/tensor_contract_sycl_bench.cc @@ -15,7 +15,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#include +#include #include #include #include @@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T // Initialize the content of the memory pools to prevent asan from // complaining. - device_.fill(a_, m_ * k_, T(12)); - device_.fill(b_, k_ * n_, T(23)); - device_.fill(c_, m_ * n_, T(31)); + device_.fill(a_, a_ + (m_ * k_), T(12)); + device_.fill(b_, b_ + (k_ * n_), T(23)); + device_.fill(c_, c_ + (m_ * n_), T(31)); Eigen::array sizeA; sizeA[0] = m_; @@ -110,9 +110,9 @@ void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorInd // Initialize the content of the memory pools to prevent asan from // complaining. 
- device_.memset(a_, 12, m_ * k_ * sizeof(T)); - device_.memset(b_, 23, k_ * n_ * sizeof(T)); - device_.memset(c_, 31, m_ * n_ * sizeof(T)); + device_.memset(a_, T(12), T(m_ * k_ * sizeof(T))); + device_.memset(b_, T(23), T(k_ * n_ * sizeof(T))); + device_.memset(c_, T(31), T(m_ * n_ * sizeof(T))); Eigen::array sizeA; sizeA[0] = m_; diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake new file mode 100644 index 000000000..73aa30f65 --- /dev/null +++ b/cmake/FindDPCPP.cmake @@ -0,0 +1,62 @@ +include_guard() + +include(CheckCXXCompilerFlag) +include(FindPackageHandleStandardArgs) + +if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa" AND + "${DPCPP_SYCL_ARCH}" STREQUAL "") + message(FATAL_ERROR "Architecture required for AMD DPCPP builds," + " please specify in DPCPP_SYCL_ARCH") +endif() + +set(DPCPP_USER_FLAGS "" CACHE STRING + "Additional user-specified compiler flags for DPC++") + +get_filename_component(DPCPP_BIN_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) +find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib") + +add_library(DPCPP::DPCPP INTERFACE IMPORTED) + +set(DPCPP_FLAGS "-fsycl;-fsycl-targets=${DPCPP_SYCL_TARGET};-fsycl-unnamed-lambda;${DPCPP_USER_FLAGS};-ftemplate-backtrace-limit=0") +if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "") + if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa") + list(APPEND DPCPP_FLAGS "-Xsycl-target-backend") + list(APPEND DPCPP_FLAGS "--offload-arch=${DPCPP_SYCL_ARCH}") + elseif("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda") + list(APPEND DPCPP_FLAGS "-Xsycl-target-backend") + list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}") + endif() +endif() + +if(UNIX) + set_target_properties(DPCPP::DPCPP PROPERTIES + INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}" + INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}" + INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR} + INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include") + message(STATUS ">>>>>>>>> DPCPP INCLUDE DIR: 
${DPCPP_BIN_DIR}/../include/sycl") +else() + set_target_properties(DPCPP::DPCPP PROPERTIES + INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}" + INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR} + INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl") +endif() + +function(add_sycl_to_target) + set(options) + set(one_value_args TARGET) + set(multi_value_args SOURCES) + cmake_parse_arguments(SB_ADD_SYCL + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS}) + target_link_libraries(${SB_ADD_SYCL_TARGET} DPCPP::DPCPP) + target_compile_features(${SB_ADD_SYCL_TARGET} PRIVATE cxx_std_17) + get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE) + if (NOT target_type STREQUAL "OBJECT_LIBRARY") + target_link_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS}) + endif() +endfunction() diff --git a/test/main.h b/test/main.h index a52da9eec..853858173 100644 --- a/test/main.h +++ b/test/main.h @@ -18,6 +18,9 @@ #include #include #include +#ifdef EIGEN_USE_SYCL +#include +#endif // The following includes of STL headers have to be done _before_ the // definition of macros min() and max(). The reason is that many STL @@ -121,9 +124,7 @@ struct imag {}; #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h #define B0 FORBIDDEN_IDENTIFIER -// `I` may be defined by complex.h: #define I FORBIDDEN_IDENTIFIER - // Unit tests calling Eigen's blas library must preserve the default blocking size // to avoid troubles. 
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS @@ -301,15 +302,16 @@ namespace Eigen } #endif //EIGEN_EXCEPTIONS - #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS + #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS #define eigen_assert(a) \ if( (!(a)) && (!no_more_assert) ) \ { \ Eigen::no_more_assert = true; \ - if(report_on_cerr_on_assert_failure) \ + if(report_on_cerr_on_assert_failure) { \ eigen_plain_assert(a); \ - else \ + } else { \ EIGEN_THROW_X(Eigen::eigen_assert_exception()); \ + } \ } #ifdef EIGEN_EXCEPTIONS diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 0bd3a001b..3365b72dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -170,10 +170,10 @@ struct TensorEvaluator, Device> m_rightImpl.cleanup(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const { const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index a4ac2ad6d..e3fc8f4fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -1185,7 +1185,6 @@ class TensorBase : public TensorBase { internal::TensorExecutor::run(assign, DefaultDevice()); return derived(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } EIGEN_DEVICE_FUNC @@ -1195,3 +1194,4 @@ class TensorBase : public TensorBase { } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H + diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index ef01e306d..e5f3d5eea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -438,13 +438,13 @@ struct TensorEvaluator, Device> : Base(op, device) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { if (this->isInnerChipping()) { // m_stride is equal to 1, so let's avoid the integer division. 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 073be8146..dea41dfd2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -331,7 +331,7 @@ template::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { // Collect dimension-wise indices (subs). array subs; @@ -360,7 +360,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c629e44cf..3a917a067 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -362,10 +362,8 @@ class TensorContractionOp : public TensorBase -struct TensorContractionEvaluatorBase : internal::no_assignment_operator -{ +struct TensorContractionEvaluatorBase { typedef typename internal::traits::Indices Indices; typedef typename internal::traits::LeftArgType LeftArgType; typedef typename internal::traits::RightArgType RightArgType; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index c6c40776f..526fc816c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -597,7 +597,7 @@ class TensorContractionKernel { const TripleDim triple_dim_) : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { const StorageIndex linearLocalThreadId = itemID.get_local_id(0); const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; @@ -636,7 +636,7 @@ class TensorContractionKernel { // privateRes memory of Each computation the compute block function is independent of local and no local concepts as // it only compute the block on each thread's private memory space EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, - PacketReturnType *privateRes) { + PacketReturnType *privateRes) const { StorageIndex idx = 0; EIGEN_CONSTEXPR StorageIndex lhs_stride = contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; @@ -661,7 +661,7 @@ class TensorContractionKernel { // class. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, - StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const { auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); }; @@ -713,7 +713,7 @@ class TensorContractionKernel { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair &, - const StorageIndex &ncOffset, const StorageIndex cOffset) { + const StorageIndex &ncOffset, const StorageIndex cOffset) const { EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = InputBlockProperties::is_rhs ? 
Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = @@ -833,7 +833,8 @@ class TensorContractionKernel { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, ThreadProperties &thread_properties, TiledMemory &tiled_input_block, - PacketReturnType *privateRes, bool &db_offset) { + PacketReturnType *privateRes, bool &db_offset) const { + // Tiling the Rhs block from global to local memory extract_block( rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), @@ -871,7 +872,7 @@ class TensorContractionKernel { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, ThreadProperties &thread_properties, - OutPtr out_ptr) { + OutPtr out_ptr) const { auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; // Allocate register space PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { @@ -897,7 +898,7 @@ class TensorContractionKernel { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block(const Input &inpt, Local local_ptr, const std::pair& local_index, - const StorageIndex &ncOffset, const StorageIndex cOffset) { + const StorageIndex &ncOffset, const StorageIndex cOffset) const { EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; EIGEN_CONSTEXPR StorageIndex LoadPerThread = @@ -1035,7 +1036,7 @@ struct GeneralVectorTensor { nonContractDim(nonContractDim_), contractDim(contractDim_) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { auto scratch_ptr = scratch.get_pointer(); const StorageIndex linearLocalThreadId = itemID.get_local_id(0); StorageIndex nonContractId = is_lhs_vec ? 
linearLocalThreadId / Properties::LocalThreadSizeC @@ -1252,7 +1253,8 @@ struct GeneralScalarContraction { const StorageIndex rng_) : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} - EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const { + auto out_ptr = out_res.get_pointer(); auto scratch_ptr = scratch.get_pointer().get(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 3cbc1ab60..7ccb17423 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -57,10 +57,10 @@ struct EigenConvolutionKernel - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const { return (boolean_check[0] && boolean_check[1]); } - void operator()(cl::sycl::nd_item<2> itemID) { + void operator()(cl::sycl::nd_item<2> itemID) const { auto buffer_ptr = buffer_acc.get_pointer(); auto kernel_ptr = kernel_filter.get_pointer(); // the required row to be calculated for the for each plane in shered memory @@ -123,11 +123,11 @@ struct EigenConvolutionKernel - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const { return (boolean_check[0] && boolean_check[1] && boolean_check[2]); } - void operator()(cl::sycl::nd_item<3> itemID) { + void operator()(cl::sycl::nd_item<3> itemID) const { auto buffer_ptr = buffer_acc.get_pointer(); auto kernel_ptr = kernel_filter.get_pointer(); // the required row to be calculated for the for each plane in shered memory @@ -212,10 +212,10 @@ struct EigenConvolutionKernel - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool 
boundary_check(const BooleanDim3 boolean_check) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const { return (boolean_check[0] && boolean_check[1] && boolean_check[2]); } - void operator()(cl::sycl::nd_item<3> itemID) { + void operator()(cl::sycl::nd_item<3> itemID) const { auto buffer_ptr = buffer_acc.get_pointer(); auto kernel_ptr = kernel_filter.get_pointer(); const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 84ebe38e8..8fdc8ba5f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -31,8 +31,7 @@ struct SyclDeviceInfo { .template get_info()), max_work_item_sizes( queue.get_device() - .template get_info< - cl::sycl::info::device::max_work_item_sizes>()), + .template get_info>()), max_mem_alloc_size( queue.get_device() .template get_info< @@ -356,7 +355,7 @@ class QueueInterface { return; } const ptrdiff_t count = end - begin; - auto f = [&](cl::sycl::handler &cgh) { + auto f = [&](cl::sycl::handler &cgh) { auto dst_acc = get_typed_range_accessor(cgh, begin, count); cgh.fill(dst_acc, value); }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 0e9cdfe41..7e13c69e3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -159,10 +159,10 @@ struct TensorEvaluator, Device> } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { m_buffer[i] = m_impl.coeff(i); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const { internal::pstoret(m_buffer + i, m_impl.template 
packet::IsAligned ? Aligned : Unaligned>(i)); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 92ad0f545..2bd94c308 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -98,7 +98,7 @@ struct TensorEvaluator return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const{ eigen_assert(m_data != NULL); return m_data[index]; } @@ -122,7 +122,7 @@ struct TensorEvaluator } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { return internal::pstoret(m_data + index, x); } @@ -137,7 +137,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& - coeffRef(const array& coords) { + coeffRef(const array& coords) const { eigen_assert(m_data != NULL); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; @@ -978,7 +978,14 @@ struct TensorEvaluator TensorEvaluator m_elseImpl; }; - } // end namespace Eigen +#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP) +template +struct cl::sycl::is_device_copyable< + Eigen::TensorEvaluator, + std::enable_if_t>::value>> : std::true_type {}; +#endif + #endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index de9bed4ed..f961b4066 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -688,12 +688,12 @@ struct ExecExprFunctorKernel { : evaluator(evaluator_), range(range_) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( - cl::sycl::nd_item<1> itemID) { + cl::sycl::nd_item<1> 
itemID) const { compute(itemID); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t - compute(const cl::sycl::nd_item<1>& itemID) { + compute(const cl::sycl::nd_item<1>& itemID) const { Index gId = static_cast(itemID.get_global_linear_id()); Index total_threads = itemID.get_global_range(0); @@ -703,7 +703,7 @@ struct ExecExprFunctorKernel { } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t - compute(const cl::sycl::nd_item<1>& itemID) { + compute(const cl::sycl::nd_item<1>& itemID) const { const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize; Index gId = static_cast(itemID.get_global_linear_id()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index d12bd6dd5..8b1408599 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -202,12 +202,12 @@ template typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { return this->m_impl.coeffRef(index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { this->m_impl.template writePacket(index, x); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 020571040..b696190b2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -267,13 +267,13 @@ template TensorBlockDesc; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { return this->m_impl.coeffRef(index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { this->m_impl.template writePacket(index, x); } @@ -733,7 +733,7 @@ struct TensorEvaluator, Device> : Base(op, device) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { if (this->m_is_identity) { return this->m_impl.coeffRef(index); @@ -743,7 +743,7 @@ struct TensorEvaluator, Device> } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { if (this->m_is_identity) { this->m_impl.template writePacket(index, x); @@ -1085,7 +1085,7 @@ struct TensorEvaluator::type PacketReturnType; typedef Strides Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { if (this->m_is_identity) { return this->m_impl.coeffRef(index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index e62397a62..ae03ba52d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -895,7 +895,7 @@ static constexpr bool RunningOnGPU = false; // binding placeholder accessors to a command group handler for SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { m_impl.bind(cgh); - m_result.bind(cgh); + if(m_result) m_result.bind(cgh); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 
397870fcf..715797d62 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -87,7 +87,7 @@ struct SecondStepFullReducer { SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_) : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {} - void operator()(cl::sycl::nd_item<1> itemID) { + void operator()(cl::sycl::nd_item<1> itemID) const { // Our empirical research shows that the best performance will be achieved // when there is only one element per thread to reduce in the second step. // in this step the second step reduction time is almost negligible. @@ -141,11 +141,11 @@ class FullReductionKernelFunctor { Index rng_, OpType op_) : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} - void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); } + void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( - const cl::sycl::nd_item<1> &itemID) { + const cl::sycl::nd_item<1> &itemID) const { auto output_ptr = final_output.get_pointer(); Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; Index globalid = itemID.get_global_id(0); @@ -184,7 +184,7 @@ class FullReductionKernelFunctor { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( - const cl::sycl::nd_item<1> &itemID) { + const cl::sycl::nd_item<1> &itemID) const { auto output_ptr = final_output.get_pointer(); Index globalid = itemID.get_global_id(0); Index localid = itemID.get_local_id(0); @@ -228,14 +228,16 @@ class GenericNondeterministicReducer { range(range_), num_values_to_reduce(num_values_to_reduce_) {} - void operator()(cl::sycl::nd_item<1> itemID) { + void operator()(cl::sycl::nd_item<1> itemID) const { + //This is to bypass the 
stateful condition in Eigen meanReducer + Op non_const_functor; + std::memcpy(&non_const_functor, &functor, sizeof (Op)); auto output_accessor_ptr = output_accessor.get_pointer(); - /// const cast added as a naive solution to solve the qualifier drop error Index globalid = static_cast(itemID.get_global_linear_id()); if (globalid < range) { CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer::reduce( - evaluator, evaluator.firstInput(globalid), functor, &accum); + evaluator, evaluator.firstInput(globalid), non_const_functor, &accum); output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); } } @@ -281,7 +283,7 @@ struct PartialReductionKernel { num_coeffs_to_reduce(num_coeffs_to_reduce_) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, - CoeffReturnType &accumulator) { + CoeffReturnType &accumulator) const { if (globalPId >= num_coeffs_to_preserve) { return; } @@ -298,7 +300,7 @@ struct PartialReductionKernel { global_offset += per_thread_global_stride; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { const Index linearLocalThreadId = itemID.get_local_id(0); Index pLocalThreadId = rt == reduction_dim::outer_most ? 
linearLocalThreadId % PannelParameters::LocalThreadSizeP : linearLocalThreadId / PannelParameters::LocalThreadSizeR; @@ -380,7 +382,7 @@ struct SecondStepPartialReduction { num_coeffs_to_preserve(num_coeffs_to_preserve_), num_coeffs_to_reduce(num_coeffs_to_reduce_) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { const Index globalId = itemID.get_global_id(0); if (globalId >= num_coeffs_to_preserve) return; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index b5e66b3b8..342d9e9b5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -441,12 +441,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { return this->m_impl.coeffRef(this->reverseIndex(index)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) { + void writePacket(Index index, const PacketReturnType& x) const { eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // This code is pilfered from TensorMorphing.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h index 636fb7d71..ecc872b3a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h @@ -109,28 +109,28 @@ struct ScanKernelFunctor { template std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - read(const Input &inpt, Index global_id) { + read(const Input &inpt, Index global_id) const { return inpt.coeff(global_id); } template std::enable_if_t 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - read(const Input &inpt, Index global_id) { + read(const Input &inpt, Index global_id) const { return inpt[global_id]; } template std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - first_step_inclusive_Operation(InclusiveOp inclusive_op) { + first_step_inclusive_Operation(InclusiveOp inclusive_op) const { inclusive_op(); } template std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - first_step_inclusive_Operation(InclusiveOp) {} + first_step_inclusive_Operation(InclusiveOp) const {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { auto out_ptr = out_accessor.get_pointer(); auto tmp_ptr = temp_accessor.get_pointer(); auto scratch_ptr = scratch.get_pointer().get(); @@ -307,7 +307,7 @@ struct ScanAdjustmentKernelFunctor { scanParameters(scanParameters_), accumulator(accumulator_) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { auto in_ptr = in_accessor.get_pointer(); auto out_ptr = out_accessor.get_pointer(); @@ -473,7 +473,7 @@ struct ScanLauncher { typedef typename Self::CoeffReturnType CoeffReturnType; typedef typename Self::Storage Storage; typedef typename Self::EvaluatorPointerType EvaluatorPointerType; - void operator()(Self &self, EvaluatorPointerType data) { + void operator()(Self &self, EvaluatorPointerType data) const { const Index total_size = internal::array_prod(self.dimensions()); const Index scan_size = self.size(); const Index scan_stride = self.stride(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 977263aa5..3d97e6684 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -390,13 +390,13 
@@ struct TensorEvaluator, Device> : Base(op, device) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; internal::pstore(values, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 609afe3b9..681c4e5df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -288,13 +288,13 @@ struct TensorEvaluator, Device> typedef typename PacketType::type PacketReturnType; static constexpr int PacketSize = PacketType::size; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt index 8ee96f51b..4fe94c6ec 100644 --- a/unsupported/doc/examples/SYCL/CMakeLists.txt +++ b/unsupported/doc/examples/SYCL/CMakeLists.txt @@ -2,9 +2,8 @@ FILE(GLOB examples_SRCS "*.cpp") set(EIGEN_SYCL ON) list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread) -if(EIGEN_SYCL_TRISYCL) - set(CMAKE_CXX_STANDARD 17) -else(EIGEN_SYCL_TRISYCL) +set(CMAKE_CXX_STANDARD 17) +if(EIGEN_SYCL_ComputeCpp) if(MSVC) list(APPEND COMPUTECPP_USER_FLAGS -DWIN32) 
else() @@ -22,7 +21,7 @@ else(EIGEN_SYCL_TRISYCL) -no-serial-memop -Xclang -cl-mad-enable) -endif(EIGEN_SYCL_TRISYCL) +endif(EIGEN_SYCL_ComputeCpp) FOREACH(example_src ${examples_SRCS}) GET_FILENAME_COMPONENT(example ${example_src} NAME_WE) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 21d8c5eb5..45cc4a3d6 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -122,6 +122,7 @@ ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1") if(EIGEN_TEST_SYCL) set(EIGEN_SYCL ON) + set(CMAKE_CXX_STANDARD 17) # Forward CMake options as preprocessor definitions if(EIGEN_SYCL_USE_DEFAULT_SELECTOR) add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR}) @@ -172,10 +173,7 @@ if(EIGEN_TEST_SYCL) add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION}) endif() - if(EIGEN_SYCL_TRISYCL) - # triSYCL now requires c++17. - set(CMAKE_CXX_STANDARD 17) - else() + if(EIGEN_SYCL_ComputeCpp) if(MSVC) list(APPEND COMPUTECPP_USER_FLAGS -DWIN32) else() @@ -193,7 +191,7 @@ if(EIGEN_TEST_SYCL) -no-serial-memop -Xclang -cl-mad-enable) - endif() + endif(EIGEN_SYCL_ComputeCpp) ei_add_test(cxx11_tensor_sycl) ei_add_test(cxx11_tensor_image_op_sycl) @@ -409,4 +407,3 @@ if (EIGEN_TEST_HIP) endif() endif() - diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index 27a82540f..be7afd200 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -27,17 +27,64 @@ using Eigen::TensorMap; // Functions used to compare the TensorMap implementation on the device with // the equivalent on the host -namespace cl { -namespace sycl { -template T abs(T x) { return cl::sycl::fabs(x); } +namespace SYCL { +template T abs(T x) { + return cl::sycl::abs(x); +} + +template <> float abs(float x) { + return cl::sycl::fabs(x); +} + +template <> double 
abs(double x) { + return cl::sycl::fabs(x); +} + template T square(T x) { return x * x; } template T cube(T x) { return x * x * x; } template T inverse(T x) { return T(1) / x; } -template T cwiseMax(T x, T y) { return cl::sycl::max(x, y); } -template T cwiseMin(T x, T y) { return cl::sycl::min(x, y); } +template T cwiseMax(T x, T y) { +return cl::sycl::max(x, y); +} +template T cwiseMin(T x, T y) { + return cl::sycl::min(x, y); } } + +#define DECLARE_UNARY_STRUCT_NON_SYCL(FUNC) \ + struct op_##FUNC { \ + template \ + auto operator()(const T& x) { \ + return SYCL::FUNC(x); \ + } \ + template \ + auto operator()(const TensorMap& x) { \ + return x.FUNC(); \ + } \ + }; + +DECLARE_UNARY_STRUCT_NON_SYCL(abs) +DECLARE_UNARY_STRUCT_NON_SYCL(square) +DECLARE_UNARY_STRUCT_NON_SYCL(cube) +DECLARE_UNARY_STRUCT_NON_SYCL(inverse) + +#define DECLARE_BINARY_STRUCT_NON_SYCL(FUNC) \ + struct op_##FUNC { \ + template \ + auto operator()(const T1& x, const T2& y){ \ + return SYCL::FUNC(x, y); \ + } \ + template \ + auto operator()(const TensorMap& x, const TensorMap& y) { \ + return x.FUNC(y); \ + } \ + }; + +DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMax) +DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMin) + + struct EqualAssignment { template void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; } @@ -119,12 +166,9 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device, } \ }; -DECLARE_UNARY_STRUCT(abs) + DECLARE_UNARY_STRUCT(sqrt) DECLARE_UNARY_STRUCT(rsqrt) -DECLARE_UNARY_STRUCT(square) -DECLARE_UNARY_STRUCT(cube) -DECLARE_UNARY_STRUCT(inverse) DECLARE_UNARY_STRUCT(tanh) DECLARE_UNARY_STRUCT(exp) DECLARE_UNARY_STRUCT(expm1) @@ -288,8 +332,6 @@ void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device, } \ }; -DECLARE_BINARY_STRUCT(cwiseMax) -DECLARE_BINARY_STRUCT(cwiseMin) #define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \ struct op_##NAME { \ diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp index 
d7ff38d34..74e902645 100644 --- a/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -23,6 +23,13 @@ #include #include +#ifdef SYCL_COMPILER_IS_DPCPP +template +struct cl::sycl::is_device_copyable< + const OffByOneScalar, + std::enable_if_t>::value>> : std::true_type {}; +#endif + template void test_device_memory(const Eigen::SyclDevice &sycl_device) { IndexType sizeDim1 = 100;