[SYCL-2020 Support] Enabling Intel DPCPP Compiler support to Eigen

This commit is contained in:
Mehdi Goli 2023-01-16 07:04:08 +00:00 committed by Antonio Sánchez
parent bae119bb7e
commit b523120687
32 changed files with 305 additions and 332 deletions

View File

@ -505,14 +505,19 @@ endif()
# add SYCL
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
if(EIGEN_TEST_SYCL)
option(EIGEN_SYCL_DPCPP "Use the DPCPP Sycl implementation (DPCPP is default SYCL-Compiler)." ON)
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation." OFF)
option(EIGEN_SYCL_ComputeCpp "Use the ComputeCpp Sycl implementation." OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy-with-user-provided-copy -Wno-unused-variable")
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
find_package(Threads REQUIRED)
if(EIGEN_SYCL_TRISYCL)
message(STATUS "Using triSYCL")
include(FindTriSYCL)
else()
elseif(EIGEN_SYCL_ComputeCpp)
message(STATUS "Using ComputeCPP SYCL")
include(FindComputeCpp)
set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
@ -523,8 +528,12 @@ if(EIGEN_TEST_SYCL)
"Use ComputeCpp driver instead of a 2 steps compilation"
${COMPUTECPP_DRIVER_DEFAULT_VALUE}
)
else() #Default SYCL compiler is DPCPP (EIGEN_SYCL_DPCPP)
set(DPCPP_SYCL_TARGET "spir64" CACHE STRING "Default target for Intel CPU/GPU")
message(STATUS "Using DPCPP")
find_package(DPCPP)
add_definitions(-DSYCL_COMPILER_IS_DPCPP)
endif(EIGEN_SYCL_TRISYCL)
option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
if(EIGEN_DONT_VECTORIZE_SYCL)
message(STATUS "Disabling SYCL vectorization in tests/examples")
# When disabling SYCL vectorization, also disable Eigen default vectorization

View File

@ -82,7 +82,9 @@
#include <cstddef>
#include <cstdlib>
#include <cmath>
#ifndef __SYCL_DEVICE_ONLY__
#include <cassert>
#endif
#include <functional>
#ifndef EIGEN_NO_IO
#include <sstream>

View File

@ -21,193 +21,53 @@
#ifndef EIGEN_PACKET_MATH_SYCL_H
#define EIGEN_PACKET_MATH_SYCL_H
#include <type_traits>
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
#ifdef SYCL_DEVICE_ONLY
#define SYCL_PLOADT_RO(address_space_target) \
template <typename packet_type, int Alignment> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from) { \
typedef typename unpacket_traits<packet_type>::type scalar; \
typedef cl::sycl::multi_ptr< \
scalar, cl::sycl::access::address_space::address_space_target> \
multi_ptr; \
auto res = packet_type( \
static_cast<typename unpacket_traits<packet_type>::type>(0)); \
res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
return res; \
}
SYCL_PLOADT_RO(global_space)
SYCL_PLOADT_RO(local_space)
#undef SYCL_PLOADT_RO
#endif
template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, T>& from) {
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}
#ifdef SYCL_DEVICE_ONLY
#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
// global space
SYCL_PLOAD(global_space, Unaligned, u)
SYCL_PLOAD(global_space, Aligned, )
// local space
SYCL_PLOAD(local_space, Unaligned, u)
SYCL_PLOAD(local_space, Aligned, )
#undef SYCL_PLOAD
#endif
#define SYCL_PLOAD(Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type> \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD
#ifdef SYCL_DEVICE_ONLY
/** \internal \returns a packet version of \a *from.
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
#define SYCL_PLOADT(address_space_target) \
template <typename packet_type, int Alignment> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from) { \
if (Alignment >= unpacket_traits<packet_type>::alignment) \
return pload<packet_type>(from); \
else \
return ploadu<packet_type>(from); \
}
// global space
SYCL_PLOADT(global_space)
// local space
SYCL_PLOADT(local_space)
#undef SYCL_PLOADT
#endif
template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>& from) {
return ploadt<packet_type, Alignment>(from.get_pointer());
}
#ifdef SYCL_DEVICE_ONLY
// private_space
#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \
#define SYCL_PLOAD(packet_type, AlignedType) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
ploadt_ro<packet_type, Alignment>( \
pload##AlignedType<packet_type>( \
const typename unpacket_traits<packet_type>::type* from) { \
typedef typename unpacket_traits<packet_type>::type scalar; \
auto res = packet_type(static_cast<scalar>(0)); \
res.template load<cl::sycl::access::address_space::private_space>( \
0, const_cast<scalar*>(from)); \
using scalar = typename unpacket_traits<packet_type>::type; \
typedef cl::sycl::multi_ptr< \
const scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
packet_type res{}; \
res.load(0, multi_ptr(from)); \
return res; \
}
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
SYCL_PLOAD(cl::sycl::cl_float4, u)
SYCL_PLOAD(cl::sycl::cl_float4, )
SYCL_PLOAD(cl::sycl::cl_double2, u)
SYCL_PLOAD(cl::sycl::cl_double2, )
#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
const typename unpacket_traits<packet_type>::type* from) { \
typedef typename unpacket_traits<packet_type>::type scalar; \
auto res = packet_type(static_cast<scalar>(0)); \
res.template load<cl::sycl::access::address_space::private_space>( \
0, const_cast<scalar*>(from)); \
return res; \
}
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
#undef SYCL_PLOAD
#undef SYCL_PLOAD_SPECIAL
#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \
#define SYCL_PSTORE(scalar, packet_type, alignment) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
typename cl::sycl::multi_ptr< \
scalar, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
to, \
const packet_type& from) { \
scalar* to, const packet_type& from) { \
typedef cl::sycl::multi_ptr< \
scalar, cl::sycl::access::address_space::address_space_target> \
scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
from.store(0, multi_ptr(to)); \
}
// global space
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, )
SYCL_PSTORE(float, cl::sycl::cl_float4, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, )
SYCL_PSTORE(double, cl::sycl::cl_double2, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
#undef SYCL_PSTORE
#define SYCL_PSTORE_T(address_space_target) \
template <typename scalar, typename packet_type, int Alignment> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \
typename cl::sycl::multi_ptr< \
scalar, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
to, \
const packet_type& from) { \
if (Alignment) \
pstore(to, from); \
else \
pstoreu(to, from); \
}
SYCL_PSTORE_T(global_space)
SYCL_PSTORE_T(local_space)
#undef SYCL_PSTORE_T
#define SYCL_PSET1(packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@ -291,22 +151,6 @@ struct get_base_packet<cl::sycl::cl_double2> {
}
};
#define SYCL_PLOAD_DUP(address_space_target) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from) { \
return get_base_packet<packet_type>::get_ploaddup(from); \
}
// global space
SYCL_PLOAD_DUP(global_space)
// local_space
SYCL_PLOAD_DUP(local_space)
#undef SYCL_PLOAD_DUP
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
@ -325,30 +169,11 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
const typename unpacket_traits<packet_type>::type& a) { \
return get_base_packet<packet_type>::set_plset(a); \
}
SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2)
#undef SYCL_PLSET
#define SYCL_PGATHER(address_space_target) \
template <typename Scalar, typename packet_type> \
EIGEN_DEVICE_FUNC inline packet_type pgather( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from, \
Index stride) { \
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
// global space
SYCL_PGATHER(global_space)
// local space
SYCL_PGATHER(local_space)
#undef SYCL_PGATHER
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
@ -362,24 +187,6 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PGATHER_SPECILIZE
#define SYCL_PSCATTER(address_space_target) \
template <typename Scalar, typename packet_type> \
EIGEN_DEVICE_FUNC inline void pscatter( \
typename cl::sycl::multi_ptr< \
typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
to, \
const packet_type& from, Index stride) { \
get_base_packet<packet_type>::set_pscatter(to, from, stride); \
}
// global space
SYCL_PSCATTER(global_space)
// local space
SYCL_PSCATTER(local_space)
#undef SYCL_PSCATTER
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
@ -563,6 +370,34 @@ inline cl::sycl::cl_double2 pblend(
}
#endif // SYCL_DEVICE_ONLY
template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, T>& from) {
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}
#define SYCL_PLOAD(Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type> \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD
template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>& from) {
return ploadt<packet_type, Alignment>(from.get_pointer());
}
#define SYCL_PSTORE(alignment) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \

View File

@ -223,7 +223,6 @@ class PointerMapper {
m_pointerMap.clear();
EIGEN_THROW_X(
std::out_of_range("The pointer is not registered in the map\n"));
}
--node;
}
@ -550,7 +549,7 @@ struct RangeAccess {
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
typedef T scalar_t;
typedef scalar_t &ref_t;
typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
typedef scalar_t *ptr_t;
// the accessor type is not necessarily the same as T
typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
@ -570,7 +569,12 @@ struct RangeAccess {
RangeAccess(std::nullptr_t) : RangeAccess() {}
// This template parameter must be removed and scalar_t should be replaced
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
return (access_.get_pointer().get() + offset_);
typedef cl::sycl::multi_ptr<scalar_t,
cl::sycl::access::address_space::generic_space,
cl::sycl::access::decorated::no>
multi_ptr;
multi_ptr p(access_);
return (p + offset_).get_raw();
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {

View File

@ -64,7 +64,7 @@ pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
cl::sycl::rounding_mode::automatic>();
auto b1 = b.template convert<cl::sycl::cl_float,
cl::sycl::rounding_mode::automatic>();
return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
}
template <>

View File

@ -16,5 +16,11 @@ following commands:
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
2. bash eigen_sycl_bench.sh
To compile the floating point GPU benchmarks using Intel DPCPP compiler
/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_benchmarks_sycl.cc benchmark_main.cc -lpthread -o eigen_dpcpp_sycl
Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
To compile the contraction with DPCPP:
/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract

View File

@ -15,7 +15,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#include <SYCL/sycl.hpp>
#include <CL/sycl.hpp>
#include <fstream>
#include <iostream>
#include <chrono>
@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.fill(a_, m_ * k_, T(12));
device_.fill(b_, k_ * n_, T(23));
device_.fill(c_, m_ * n_, T(31));
device_.fill(a_, a_ + (m_ * k_), T(12));
device_.fill(b_, b_ + (k_ * n_), T(23));
device_.fill(c_, c_ + (m_ * n_), T(31));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;
@ -110,9 +110,9 @@ void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorInd
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
device_.memset(a_, T(12), T(m_ * k_ * sizeof(T)));
device_.memset(b_, T(23), T(k_ * n_ * sizeof(T)));
device_.memset(c_, T(31), T(m_ * n_ * sizeof(T)));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;

62
cmake/FindDPCPP.cmake Normal file
View File

@ -0,0 +1,62 @@
# FindDPCPP.cmake -- set up an imported DPCPP::DPCPP interface target for the
# Intel DPC++ (SYCL) compiler.  Expects CMAKE_CXX_COMPILER to point at the
# DPC++ driver and honours the cache variables DPCPP_SYCL_TARGET,
# DPCPP_SYCL_ARCH and DPCPP_USER_FLAGS.
include_guard()
include(CheckCXXCompilerFlag)
include(FindPackageHandleStandardArgs)

# AMD offload cannot default the GPU architecture, so require it explicitly.
if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa" AND
   "${DPCPP_SYCL_ARCH}" STREQUAL "")
  message(FATAL_ERROR "Architecture required for AMD DPCPP builds,"
                      " please specify in DPCPP_SYCL_ARCH")
endif()

set(DPCPP_USER_FLAGS "" CACHE STRING
    "Additional user-specified compiler flags for DPC++")

# Locate the SYCL runtime library relative to the compiler driver
# (the runtime was renamed sycl -> sycl6 in newer oneAPI releases).
get_filename_component(DPCPP_BIN_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")

# Report found/not-found through the standard mechanism so that
# find_package(DPCPP) sets DPCPP_FOUND like any other Find module.
find_package_handle_standard_args(DPCPP
  REQUIRED_VARS DPCPP_BIN_DIR DPCPP_LIB_DIR)

add_library(DPCPP::DPCPP INTERFACE IMPORTED)

set(DPCPP_FLAGS "-fsycl;-fsycl-targets=${DPCPP_SYCL_TARGET};-fsycl-unnamed-lambda;${DPCPP_USER_FLAGS};-ftemplate-backtrace-limit=0")
if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
  if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa")
    list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
    list(APPEND DPCPP_FLAGS "--offload-arch=${DPCPP_SYCL_ARCH}")
  elseif("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
    list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
    list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
  endif()
endif()

if(UNIX)
  # On UNIX the SYCL flags must also appear at link time so the device code
  # is linked by the DPC++ driver.
  set_target_properties(DPCPP::DPCPP PROPERTIES
    INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_LIBRARIES "${DPCPP_LIB_DIR}"
    INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include")
  message(VERBOSE "DPCPP include dir: ${DPCPP_BIN_DIR}/../include/sycl")
else()
  set_target_properties(DPCPP::DPCPP PROPERTIES
    INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_LIBRARIES "${DPCPP_LIB_DIR}"
    INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl")
endif()
# add_sycl_to_target(TARGET <tgt> SOURCES <srcs...>)
#
# Attach the DPC++ SYCL compile and link options to an existing target.
# SOURCES is accepted for interface compatibility with the ComputeCpp /
# triSYCL helpers of the same name; DPC++ compiles SYCL code in a single
# pass, so the sources need no per-file treatment here.
function(add_sycl_to_target)
  set(options)
  set(one_value_args TARGET)
  set(multi_value_args SOURCES)
  cmake_parse_arguments(SB_ADD_SYCL
    "${options}"
    "${one_value_args}"
    "${multi_value_args}"
    ${ARGN}
  )
  if(NOT SB_ADD_SYCL_TARGET)
    message(FATAL_ERROR "add_sycl_to_target: TARGET argument is required")
  endif()
  target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
  # Keyword signature: consumers of this target inherit the SYCL usage
  # requirements carried by the imported interface target.
  target_link_libraries(${SB_ADD_SYCL_TARGET} PUBLIC DPCPP::DPCPP)
  target_compile_features(${SB_ADD_SYCL_TARGET} PRIVATE cxx_std_17)
  # Object libraries cannot carry link options.
  get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE)
  if(NOT target_type STREQUAL "OBJECT_LIBRARY")
    target_link_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
  endif()
endfunction()

View File

@ -18,6 +18,9 @@
#include <vector>
#include <typeinfo>
#include <functional>
#ifdef EIGEN_USE_SYCL
#include <CL/sycl.hpp>
#endif
// The following includes of STL headers have to be done _before_ the
// definition of macros min() and max(). The reason is that many STL
@ -121,9 +124,7 @@ struct imag {};
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
// B0 is defined in POSIX header termios.h
#define B0 FORBIDDEN_IDENTIFIER
// `I` may be defined by complex.h:
#define I FORBIDDEN_IDENTIFIER
// Unit tests calling Eigen's blas library must preserve the default blocking size
// to avoid troubles.
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
@ -301,15 +302,16 @@ namespace Eigen
}
#endif //EIGEN_EXCEPTIONS
#elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS
#elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS
#define eigen_assert(a) \
if( (!(a)) && (!no_more_assert) ) \
{ \
Eigen::no_more_assert = true; \
if(report_on_cerr_on_assert_failure) \
if(report_on_cerr_on_assert_failure) { \
eigen_plain_assert(a); \
else \
} else { \
EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
} \
}
#ifdef EIGEN_EXCEPTIONS

View File

@ -170,10 +170,10 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_rightImpl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;

View File

@ -1185,7 +1185,6 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return derived();
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
EIGEN_DEVICE_FUNC
@ -1195,3 +1194,4 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H

View File

@ -438,13 +438,13 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
: Base(op, device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
if (this->isInnerChipping()) {
// m_stride is equal to 1, so let's avoid the integer division.

View File

@ -331,7 +331,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
// Collect dimension-wise indices (subs).
array<Index, Base::NumDims> subs;
@ -360,7 +360,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
const int packetSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)

View File

@ -362,10 +362,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
const OutputKernelType m_output_kernel;
};
template<typename Derived>
struct TensorContractionEvaluatorBase : internal::no_assignment_operator
{
struct TensorContractionEvaluatorBase {
typedef typename internal::traits<Derived>::Indices Indices;
typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
typedef typename internal::traits<Derived>::RightArgType RightArgType;

View File

@ -597,7 +597,7 @@ class TensorContractionKernel {
const TripleDim triple_dim_)
: TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
@ -636,7 +636,7 @@ class TensorContractionKernel {
// privateRes memory of Each computation the compute block function is independent of local and no local concepts as
// it only compute the block on each thread's private memory space
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
PacketReturnType *privateRes) {
PacketReturnType *privateRes) const {
StorageIndex idx = 0;
EIGEN_CONSTEXPR StorageIndex lhs_stride =
contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
@ -661,7 +661,7 @@ class TensorContractionKernel {
// class.
template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
};
@ -713,7 +713,7 @@ class TensorContractionKernel {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::enable_if_t<contract_tp == contraction_type::no_local>
extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
const StorageIndex &ncOffset, const StorageIndex cOffset) {
const StorageIndex &ncOffset, const StorageIndex cOffset) const {
EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
@ -833,7 +833,8 @@ class TensorContractionKernel {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
ThreadProperties<StorageIndex> &thread_properties,
TiledMemory &tiled_input_block,
PacketReturnType *privateRes, bool &db_offset) {
PacketReturnType *privateRes, bool &db_offset) const {
// Tiling the Rhs block from global to local memory
extract_block<RHSBlockProperties, is_internal_block>(
rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
@ -871,7 +872,7 @@ class TensorContractionKernel {
template <bool is_internal_block, typename OutPtr>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
ThreadProperties<StorageIndex> &thread_properties,
OutPtr out_ptr) {
OutPtr out_ptr) const {
auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
// Allocate register space
PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
@ -897,7 +898,7 @@ class TensorContractionKernel {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::enable_if_t<contract_tp == contraction_type::local>
extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
const StorageIndex &ncOffset, const StorageIndex cOffset) {
const StorageIndex &ncOffset, const StorageIndex cOffset) const {
EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
EIGEN_CONSTEXPR StorageIndex LoadPerThread =
@ -1035,7 +1036,7 @@ struct GeneralVectorTensor {
nonContractDim(nonContractDim_),
contractDim(contractDim_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto scratch_ptr = scratch.get_pointer();
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
@ -1252,7 +1253,8 @@ struct GeneralScalarContraction {
const StorageIndex rng_)
: scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_res.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();

View File

@ -57,10 +57,10 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
input_range(input_range_) {}
template <typename BooleanDim2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
return (boolean_check[0] && boolean_check[1]);
}
void operator()(cl::sycl::nd_item<2> itemID) {
void operator()(cl::sycl::nd_item<2> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
// the required row to be calculated for the for each plane in shared memory
@ -123,11 +123,11 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
kernel_size(kernel_size_),
input_range(input_range_) {}
template <typename BooleanDim3>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) {
void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
// the required row to be calculated for the for each plane in shared memory
@ -212,10 +212,10 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
input_range(input_range_),
numP(numP_) {}
template <typename BooleanDim3>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) {
void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

View File

@ -31,8 +31,7 @@ struct SyclDeviceInfo {
.template get_info<cl::sycl::info::device::local_mem_type>()),
max_work_item_sizes(
queue.get_device()
.template get_info<
cl::sycl::info::device::max_work_item_sizes>()),
.template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
max_mem_alloc_size(
queue.get_device()
.template get_info<

View File

@ -159,10 +159,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
m_buffer[i] = m_impl.coeff(i);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
}

View File

@ -98,7 +98,7 @@ struct TensorEvaluator
return m_data[index];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const{
eigen_assert(m_data != NULL);
return m_data[index];
}
@ -122,7 +122,7 @@ struct TensorEvaluator
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
}
@ -137,7 +137,7 @@ struct TensorEvaluator
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
coeffRef(const array<DenseIndex, NumCoords>& coords) {
coeffRef(const array<DenseIndex, NumCoords>& coords) const {
eigen_assert(m_data != NULL);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
@ -978,7 +978,14 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
TensorEvaluator<ElseArgType, Device> m_elseImpl;
};
} // end namespace Eigen
#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP)
template <typename Derived, typename Device>
struct cl::sycl::is_device_copyable<
Eigen::TensorEvaluator<Derived, Device>,
std::enable_if_t<!std::is_trivially_copyable<
Eigen::TensorEvaluator<Derived, Device>>::value>> : std::true_type {};
#endif
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H

View File

@ -688,12 +688,12 @@ struct ExecExprFunctorKernel {
: evaluator(evaluator_), range(range_) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
cl::sycl::nd_item<1> itemID) {
cl::sycl::nd_item<1> itemID) const {
compute(itemID);
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
compute(const cl::sycl::nd_item<1>& itemID) {
compute(const cl::sycl::nd_item<1>& itemID) const {
Index gId = static_cast<Index>(itemID.get_global_linear_id());
Index total_threads = itemID.get_global_range(0);
@ -703,7 +703,7 @@ struct ExecExprFunctorKernel {
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
compute(const cl::sycl::nd_item<1>& itemID) {
compute(const cl::sycl::nd_item<1>& itemID) const {
const Index vectorizedRange =
(range / Evaluator::PacketSize) * Evaluator::PacketSize;
Index gId = static_cast<Index>(itemID.get_global_linear_id());

View File

@ -202,12 +202,12 @@ template<typename ArgType, typename Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
this->m_impl.template writePacket<StoreMode>(index, x);
}

View File

@ -267,13 +267,13 @@ template<typename NewDimensions, typename ArgType, typename Device>
TensorBlockDesc;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
this->m_impl.template writePacket<StoreMode>(index, x);
}
@ -733,7 +733,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
: Base(op, device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
if (this->m_is_identity) {
return this->m_impl.coeffRef(index);
@ -743,7 +743,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
if (this->m_is_identity) {
this->m_impl.template writePacket<StoreMode>(index, x);
@ -1085,7 +1085,7 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
if (this->m_is_identity) {
return this->m_impl.coeffRef(index);

View File

@ -895,7 +895,7 @@ static constexpr bool RunningOnGPU = false;
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_result.bind(cgh);
if(m_result) m_result.bind(cgh);
}
#endif

View File

@ -87,7 +87,7 @@ struct SecondStepFullReducer {
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
: scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
void operator()(cl::sycl::nd_item<1> itemID) {
void operator()(cl::sycl::nd_item<1> itemID) const {
// Our empirical research shows that the best performance will be achieved
// when there is only one element per thread to reduce in the second step.
// in this step the second step reduction time is almost negligible.
@ -141,11 +141,11 @@ class FullReductionKernelFunctor {
Index rng_, OpType op_)
: scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
const cl::sycl::nd_item<1> &itemID) {
const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
Index globalid = itemID.get_global_id(0);
@ -184,7 +184,7 @@ class FullReductionKernelFunctor {
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
const cl::sycl::nd_item<1> &itemID) {
const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
@ -228,14 +228,16 @@ class GenericNondeterministicReducer {
range(range_),
num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
void operator()(cl::sycl::nd_item<1> itemID) const {
//This is to bypass the statefull condition in Eigen meanReducer
Op non_const_functor;
std::memcpy(&non_const_functor, &functor, sizeof (Op));
auto output_accessor_ptr = output_accessor.get_pointer();
/// const cast added as a naive solution to solve the qualifier drop error
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
evaluator, evaluator.firstInput(globalid), functor, &accum);
evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
}
}
@ -281,7 +283,7 @@ struct PartialReductionKernel {
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
CoeffReturnType &accumulator) {
CoeffReturnType &accumulator) const {
if (globalPId >= num_coeffs_to_preserve) {
return;
}
@ -298,7 +300,7 @@ struct PartialReductionKernel {
global_offset += per_thread_global_stride;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const Index linearLocalThreadId = itemID.get_local_id(0);
Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
: linearLocalThreadId / PannelParameters::LocalThreadSizeR;
@ -380,7 +382,7 @@ struct SecondStepPartialReduction {
num_coeffs_to_preserve(num_coeffs_to_preserve_),
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const Index globalId = itemID.get_global_id(0);
if (globalId >= num_coeffs_to_preserve) return;

View File

@ -441,12 +441,12 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
return this->m_impl.coeffRef(this->reverseIndex(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x) {
void writePacket(Index index, const PacketReturnType& x) const {
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// This code is pilfered from TensorMorphing.h

View File

@ -109,28 +109,28 @@ struct ScanKernelFunctor {
template <scan_step sst = stp, typename Input>
std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
read(const Input &inpt, Index global_id) {
read(const Input &inpt, Index global_id) const {
return inpt.coeff(global_id);
}
template <scan_step sst = stp, typename Input>
std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
read(const Input &inpt, Index global_id) {
read(const Input &inpt, Index global_id) const {
return inpt[global_id];
}
template <scan_step sst = stp, typename InclusiveOp>
std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
first_step_inclusive_Operation(InclusiveOp inclusive_op) {
first_step_inclusive_Operation(InclusiveOp inclusive_op) const {
inclusive_op();
}
template <scan_step sst = stp, typename InclusiveOp>
std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
first_step_inclusive_Operation(InclusiveOp) {}
first_step_inclusive_Operation(InclusiveOp) const {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_accessor.get_pointer();
auto tmp_ptr = temp_accessor.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
@ -307,7 +307,7 @@ struct ScanAdjustmentKernelFunctor {
scanParameters(scanParameters_),
accumulator(accumulator_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto in_ptr = in_accessor.get_pointer();
auto out_ptr = out_accessor.get_pointer();
@ -473,7 +473,7 @@ struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::Storage Storage;
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
void operator()(Self &self, EvaluatorPointerType data) {
void operator()(Self &self, EvaluatorPointerType data) const {
const Index total_size = internal::array_prod(self.dimensions());
const Index scan_size = self.size();
const Index scan_stride = self.stride();

View File

@ -390,13 +390,13 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
: Base(op, device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);

View File

@ -288,13 +288,13 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());

View File

@ -2,9 +2,8 @@ FILE(GLOB examples_SRCS "*.cpp")
set(EIGEN_SYCL ON)
list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
if(EIGEN_SYCL_TRISYCL)
set(CMAKE_CXX_STANDARD 17)
else(EIGEN_SYCL_TRISYCL)
if(EIGEN_SYCL_ComputeCpp)
if(MSVC)
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
else()
@ -22,7 +21,7 @@ else(EIGEN_SYCL_TRISYCL)
-no-serial-memop
-Xclang
-cl-mad-enable)
endif(EIGEN_SYCL_TRISYCL)
endif(EIGEN_SYCL_ComputeCpp)
FOREACH(example_src ${examples_SRCS})
GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)

View File

@ -122,6 +122,7 @@ ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
if(EIGEN_TEST_SYCL)
set(EIGEN_SYCL ON)
set(CMAKE_CXX_STANDARD 17)
# Forward CMake options as preprocessor definitions
if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
@ -172,10 +173,7 @@ if(EIGEN_TEST_SYCL)
add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
endif()
if(EIGEN_SYCL_TRISYCL)
# triSYCL now requires c++17.
set(CMAKE_CXX_STANDARD 17)
else()
if(EIGEN_SYCL_ComputeCpp)
if(MSVC)
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
else()
@ -193,7 +191,7 @@ if(EIGEN_TEST_SYCL)
-no-serial-memop
-Xclang
-cl-mad-enable)
endif()
endif(EIGEN_SYCL_ComputeCpp)
ei_add_test(cxx11_tensor_sycl)
ei_add_test(cxx11_tensor_image_op_sycl)
@ -409,4 +407,3 @@ if (EIGEN_TEST_HIP)
endif()
endif()

View File

@ -27,17 +27,64 @@ using Eigen::TensorMap;
// Functions used to compare the TensorMap implementation on the device with
// the equivalent on the host
namespace cl {
namespace sycl {
template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
namespace SYCL {
template <typename T> T abs(T x) {
return cl::sycl::abs(x);
}
template <> float abs(float x) {
return cl::sycl::fabs(x);
}
template <> double abs(double x) {
return cl::sycl::fabs(x);
}
template <typename T> T square(T x) { return x * x; }
template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return T(1) / x; }
template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
template <typename T> T cwiseMax(T x, T y) {
return cl::sycl::max(x, y);
}
template <typename T> T cwiseMin(T x, T y) {
return cl::sycl::min(x, y);
}
}
#define DECLARE_UNARY_STRUCT_NON_SYCL(FUNC) \
struct op_##FUNC { \
template <typename T> \
auto operator()(const T& x) { \
return SYCL::FUNC(x); \
} \
template <typename T> \
auto operator()(const TensorMap<T>& x) { \
return x.FUNC(); \
} \
};
DECLARE_UNARY_STRUCT_NON_SYCL(abs)
DECLARE_UNARY_STRUCT_NON_SYCL(square)
DECLARE_UNARY_STRUCT_NON_SYCL(cube)
DECLARE_UNARY_STRUCT_NON_SYCL(inverse)
#define DECLARE_BINARY_STRUCT_NON_SYCL(FUNC) \
struct op_##FUNC { \
template <typename T1, typename T2> \
auto operator()(const T1& x, const T2& y){ \
return SYCL::FUNC(x, y); \
} \
template <typename T1, typename T2> \
auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) { \
return x.FUNC(y); \
} \
};
DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMax)
DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMin)
struct EqualAssignment {
template <typename Lhs, typename Rhs>
void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
@ -119,12 +166,9 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
} \
};
DECLARE_UNARY_STRUCT(abs)
DECLARE_UNARY_STRUCT(sqrt)
DECLARE_UNARY_STRUCT(rsqrt)
DECLARE_UNARY_STRUCT(square)
DECLARE_UNARY_STRUCT(cube)
DECLARE_UNARY_STRUCT(inverse)
DECLARE_UNARY_STRUCT(tanh)
DECLARE_UNARY_STRUCT(exp)
DECLARE_UNARY_STRUCT(expm1)
@ -288,8 +332,6 @@ void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
} \
};
DECLARE_BINARY_STRUCT(cwiseMax)
DECLARE_BINARY_STRUCT(cwiseMin)
#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
struct op_##NAME { \

View File

@ -23,6 +23,13 @@
#include <stdint.h>
#include <iostream>
#ifdef SYCL_COMPILER_IS_DPCPP
template <typename T>
struct cl::sycl::is_device_copyable<
const OffByOneScalar<T>,
std::enable_if_t<!std::is_trivially_copyable<const OffByOneScalar<T>>::value>> : std::true_type {};
#endif
template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
IndexType sizeDim1 = 100;