[SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile calls from lhs and rhs into a single extract_block function.
* Adding the get_tensor operation, which was missing from TensorContractionMapper.
* Adding the -D definition that was missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct (see the sketch after this list).
* Fixing a typo in the SYCL device.
* Unifying the load into private registers for the tall/skinny no-shared-memory path.
* Unifying the load into vector tiles for the tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing the Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host, so that SSE optimization can still be used on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake files.
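A minimal sketch of the index-wrapping idea mentioned above, with hypothetical names that only illustrate the shape of the change (the real TensorScanSycl code differs):

// Hypothetical example: instead of passing several loose index arguments to the
// scan kernel, they are bundled into one parameter struct.
#include <cstddef>

struct ScanParameters {
  std::size_t total_size;       // total number of elements to scan
  std::size_t scan_size;        // length of the scanned dimension
  std::size_t non_scan_size;    // product of the remaining dimensions
  std::size_t scan_stride;      // stride between consecutive scan elements
  std::size_t non_scan_stride;  // stride between independent scan lines
};

// A kernel functor then carries a single ScanParameters member rather than
// several separate index fields.
struct ScanKernel {
  ScanParameters params;
  explicit ScanKernel(const ScanParameters& p) : params(p) {}
};

int main() {
  ScanParameters p{100, 10, 10, 1, 10};
  ScanKernel kernel(p);
  return kernel.params.total_size == 100 ? 0 : 1;
}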
Mehdi Goli 2019-11-28 10:08:54 +00:00
parent ea51a9eace
commit 00f32752f7
56 changed files with 7336 additions and 4826 deletions

View File

@ -492,6 +492,21 @@ if(EIGEN_TEST_SYCL)
else()
message(STATUS "Using ComputeCPP SYCL")
include(FindComputeCpp)
set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
if (NOT MSVC)
set(COMPUTECPP_DRIVER_DEFAULT_VALUE ON)
endif()
option(COMPUTECPP_USE_COMPILER_DRIVER
"Use ComputeCpp driver instead of a 2 steps compilation"
${COMPUTECPP_DRIVER_DEFAULT_VALUE}
)
endif(EIGEN_SYCL_TRISYCL)
option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
if(EIGEN_DONT_VECTORIZE_SYCL)
message(STATUS "Disabling SYCL vectorization in tests/examples")
# When disabling SYCL vectorization, also disable Eigen default vectorization
add_definitions(-DEIGEN_DONT_VECTORIZE=1)
add_definitions(-DEIGEN_DONT_VECTORIZE_SYCL=1)
endif()
endif()

View File

@ -161,6 +161,8 @@ struct PacketWrapper<PacketReturnType, 4> {
eigen_assert(false && "INDEX MUST BE BETWEEN 0 and 3");
abort();
}
__builtin_unreachable();
}
EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(
Scalar in, Scalar other) {
@ -203,6 +205,8 @@ struct PacketWrapper<PacketReturnType, 2> {
eigen_assert(false && "INDEX MUST BE BETWEEN 0 and 1");
abort();
}
__builtin_unreachable();
}
EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(
Scalar in, Scalar other) {
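Both hunks above add the same pattern to the PacketWrapper specializations. As a standalone sketch (simplified, not the actual Eigen code), the idea is that the assert/abort handles an out-of-range index at runtime, while __builtin_unreachable() tells GCC and Clang that control never falls out of the function, which silences missing-return warnings:

#include <cassert>
#include <cstdlib>

// Simplified stand-in for PacketWrapper::scalarize: every valid index returns,
// invalid indices abort, and the trailing __builtin_unreachable() documents
// that the end of the function can never be reached.
static float scalarize(const float (&packet)[4], int index) {
  switch (index) {
    case 0: return packet[0];
    case 1: return packet[1];
    case 2: return packet[2];
    case 3: return packet[3];
    default:
      assert(false && "INDEX MUST BE BETWEEN 0 and 3");
      abort();
  }
  __builtin_unreachable();
}

int main() {
  float packet[4] = {1.f, 2.f, 3.f, 4.f};
  return scalarize(packet, 2) == 3.f ? 0 : 1;
}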

View File

@ -240,15 +240,19 @@
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __AVX__
#ifndef EIGEN_USE_SYCL
#define EIGEN_VECTORIZE_AVX
#endif
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __AVX2__
#ifndef EIGEN_USE_SYCL
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
#endif
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
@ -267,19 +271,23 @@
#error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
#endif
#endif
#ifndef EIGEN_USE_SYCL
#define EIGEN_VECTORIZE_AVX512
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
#endif
#define EIGEN_VECTORIZE_FMA
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#ifndef EIGEN_USE_SYCL
#ifdef __AVX512DQ__
#define EIGEN_VECTORIZE_AVX512DQ
#endif
#ifdef __AVX512ER__
#define EIGEN_VECTORIZE_AVX512ER
#endif
#endif
#endif
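With these guards, compiling host code with -mavx (or AVX2/AVX-512) while EIGEN_USE_SYCL is defined leaves the AVX-level EIGEN_VECTORIZE_* macros undefined but keeps the SSE ones, matching the "SSE on the host" item in the commit message. A quick check, as a sketch that assumes Eigen is on the include path:

// Compile once with -mavx and once with -mavx -DEIGEN_USE_SYCL to see the
// reported host vectorization level change from AVX to SSE4.2.
#include <cstdio>
#include <Eigen/Core>

int main() {
#if defined(EIGEN_VECTORIZE_AVX)
  std::puts("host vectorization: AVX");
#elif defined(EIGEN_VECTORIZE_SSE4_2)
  std::puts("host vectorization: SSE4.2");
#else
  std::puts("host vectorization: neither AVX nor SSE4.2");
#endif
  return 0;
}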

View File

@ -854,7 +854,7 @@
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline))
// All functions callable from CUDA/HIP code must be qualified with __device__
#elif defined(EIGEN_GPUCC)
#define EIGEN_DEVICE_FUNC __host__ __device__
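This hunk extends the SYCL definition of EIGEN_DEVICE_FUNC, which previously carried only always_inline, with the flatten attribute. In isolation (plain GCC/Clang attributes, not Eigen code), the combination behaves like this: always_inline forces the annotated function to be inlined into its callers, and flatten additionally inlines the calls made inside its body.

// Standalone illustration of __attribute__((flatten)) __attribute__((always_inline)).
__attribute__((always_inline)) inline int add(int a, int b) { return a + b; }

__attribute__((flatten)) __attribute__((always_inline))
inline int add3(int a, int b, int c) {
  // Because of flatten, the nested calls to add() are inlined here as well.
  return add(add(a, b), c);
}

int main() { return add3(1, 2, 3) == 6 ? 0 : 1; }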

View File

@ -11,15 +11,10 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU
We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
To compile and run the benchmark for SYCL, using ComputeCpp, simply run the following commands:
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
2. bash eigen_sycl_bench.sh
Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
The older ComputeCpp instructions below are removed by this commit:
To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code):
1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1
2. The host compilation pass that generates the final host binary.
clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o
clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl
export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib
3. Run the benchmark
./tensor_benchmark_sycl

View File

@ -0,0 +1,30 @@
rm -f tensor_benchmark_sycl
: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \
tensor_benchmarks_sycl.cc \
benchmark_main.cc \
-I ../../ \
-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \
-std=c++11 \
-march=native \
-O3 \
-DNDEBUG \
-DEIGEN_MPL2_ONLY \
-DEIGEN_USE_SYCL=1 \
-DEIGEN_SYCL_LOCAL_MEM=1 \
-no-serial-memop \
-mllvm \
-inline-threshold=10000 \
-fsycl-ih-last \
-sycl-driver \
-Xclang -cl-mad-enable \
-lOpenCL \
-lComputeCpp \
-lpthread \
-o \
tensor_benchmark_sycl\
${@:1}
export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
./tensor_benchmark_sycl

View File

@ -0,0 +1,7 @@
rm -f tensor_contract_sycl_bench
: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ tensor_contract_sycl_bench.cc -I ../../ -I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench ${@:1}
export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
./tensor_contract_sycl_bench

View File

@ -27,6 +27,11 @@ template <typename Device, typename T> class BenchmarkSuite {
initialize();
}
BenchmarkSuite(const Device& device, size_t m, size_t k)
: m_(1), k_(k), n_(m), device_(device) {
initialize();
}
~BenchmarkSuite() {
device_.deallocate(a_);
device_.deallocate(b_);
@ -79,6 +84,11 @@ template <typename Device, typename T> class BenchmarkSuite {
sizes[0] = m_;
sizes[1] = m_;
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = C.random();
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = C.random();
@ -264,6 +274,7 @@ template <typename Device, typename T> class BenchmarkSuite {
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void broadcasting(int num_iters) {
Eigen::array<TensorIndex, 2> size_a;
size_a[0] = m_;
@ -406,8 +417,8 @@ for (int iter = 0; iter < 10; ++iter) {
b_, input_size);
Eigen::array<TensorIndex, 1> output_size;
output_size[0] = k_;
TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
a_, output_size);
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::array<TensorIndex, 1> sum_along_dim;
@ -419,12 +430,12 @@ for (int iter = 0; iter < 10; ++iter) {
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
A.device(device_) = B.sum(sum_along_dim);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
A.device(device_) = B.sum(sum_along_dim);
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
@ -455,37 +466,27 @@ for (int iter = 0; iter < 10; ++iter) {
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
}
// do a contraction which is equivalent to a matrix multiplication
void contraction(int num_iters) {
contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
}
void contractionRowMajor(int num_iters) {
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
}
void contractionRowMajorAT(int num_iters) {
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
}
void contractionRowMajorBT(int num_iters) {
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
}
void contractionRowMajorABT(int num_iters) {
contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
}
void convolution(int num_iters, int kernel_x, int kernel_y) {
@ -513,13 +514,49 @@ for (int iter = 0; iter < 10; ++iter) {
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.convolve(B, dims);
}
// Record the number of FLOPs executed per second (kernel_size
// multiplications and additions for each value in the resulting tensor)
finalizeBenchmark(static_cast<int64_t>(2) *
(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
}
private:
// do a contraction which is equivalent to a matrix multiplication
template<int Layout>
void contraction(int num_iters, bool trans_a, bool trans_b) {
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = (trans_a ? k_: m_);
sizeA[1] = (trans_a ? m_: k_);
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = (trans_b ? n_: k_);
sizeB[1] = (trans_b ? k_: n_);
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
TensorIndex a_contract_dim = (trans_a ? 0 : 1);
TensorIndex b_contract_dim = (trans_b ? 1 : 0);
dims[0] = DimPair(a_contract_dim, b_contract_dim);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
// Record the number of FLOP executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
}
void initialize() {
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
@ -531,7 +568,6 @@ for (int iter = 0; iter < 10; ++iter) {
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
//BenchmarkUseRealTime();
}
inline void finalizeBenchmark(int64_t num_items) {
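The new row-major and transposed variants are driven exactly like the existing column-major entry point. A minimal usage sketch follows; it assumes the benchmark timing helpers used by tensor_benchmarks.h are available at link time, as in the drivers above, and the 1024/10 values are arbitrary:

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>
#include "tensor_benchmarks.h"

int main() {
  cl::sycl::gpu_selector selector;
  Eigen::QueueInterface queue(selector);
  Eigen::SyclDevice device(&queue);

  // m = k = n = 1024, 10 timed iterations per variant.
  BenchmarkSuite<Eigen::SyclDevice, float> suite(device, 1024, 1024, 1024);
  suite.contraction(10);            // column-major, A * B
  suite.contractionRowMajor(10);    // row-major, A * B
  suite.contractionRowMajorAT(10);  // row-major, A transposed
  suite.contractionRowMajorBT(10);  // row-major, B transposed
  suite.contractionRowMajorABT(10); // row-major, A and B transposed
  return 0;
}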

View File

@ -5,19 +5,76 @@
#include "tensor_benchmarks.h" #include "tensor_benchmarks.h"
cl::sycl::gpu_selector selector;
Eigen::QueueInterface queue(selector);
#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2) \
static void BM_##FUNC##_##D1##x##D2(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10);
BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352);
BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352);
BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088);
BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088);
BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272);
BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272);
BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568);
BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568);
BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568);
BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568);
BM_FuncWithInput2DimsGPU(colReduction, 100352, 256);
BM_FuncWithInput2DimsGPU(colReduction, 100352, 64);
BM_FuncWithInput2DimsGPU(colReduction, 25088, 512);
BM_FuncWithInput2DimsGPU(colReduction, 6272, 102);
BM_FuncWithInput2DimsGPU(colReduction, 25088, 128);
BM_FuncWithInput2DimsGPU(colReduction, 6272, 256);
BM_FuncWithInput2DimsGPU(colReduction, 1568, 204);
BM_FuncWithInput2DimsGPU(colReduction, 1568, 512);
BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024);
BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048);
BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 256, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 512, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 64, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 128, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1);
BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1);
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
BM_FuncGPU(fullReduction);
BM_FuncGPU(memcpy);
BM_FuncGPU(typeCasting);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(rowChip);
BM_FuncGPU(colChip);
@ -28,40 +85,50 @@ BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(algebraicFunc);
BM_FuncGPU(transcendentalFunc);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
BM_FuncGPU(fullReduction);
// Contractions
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N);
BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64);
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N);
BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64);
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N);
BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64);
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N);
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N);
BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64);
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
Eigen::SyclDevice device(&queue); \
BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
BM_FuncWithKernelDimsGPU(convolution, 7, 1);

View File

@ -1,2 +0,0 @@
#include "tensor_benchmarks_sycl.cc"
#include "tensor_benchmarks_sycl.sycl"

View File

@ -0,0 +1,325 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_BENCH_CONTRACT_SYCL
#define EIGEN_BENCH_CONTRACT_SYCL
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#include <SYCL/sycl.hpp>
#include <fstream>
#include <iostream>
#include <chrono>
#include <ctime>
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
std::ofstream out("Result.txt");
std::chrono::time_point<std::chrono::system_clock> get_time(){
std::chrono::time_point<std::chrono::system_clock> start, end;
return std::chrono::system_clock::now();
}
template<typename Start, typename End, typename TensorIndex>
void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
std::chrono::duration<double> elapsed_seconds = end-start;
std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
}
// do a contraction which is equivalent to a matrix multiplication
template<typename T, typename Device, typename TensorIndex>
void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;
sizeA[1] = k_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = k_;
sizeB[1] = n_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(1, 0);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
// do a contraction which is equivalent to a matrix multiplication
template<typename T, typename Device, typename TensorIndex>
void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;
sizeA[1] = k_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = k_;
sizeB[1] = n_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(1, 0);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
template<typename T, typename Device, typename TensorIndex>
void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = k_;
sizeA[1] = m_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = k_;
sizeB[1] = n_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(0, 0);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
template<typename T, typename Device, typename TensorIndex>
void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;
sizeA[1] = k_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = n_;
sizeB[1] = k_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(1, 1);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
template<typename T, typename Device, typename TensorIndex>
void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
T* a_;
T* b_;
T* c_;
a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = k_;
sizeA[1] = m_;
Eigen::array<TensorIndex, 2> sizeB;
sizeB[0] = n_;
sizeB[1] = k_;
Eigen::array<TensorIndex, 2> sizeC;
sizeC[0] = m_;
sizeC[1] = n_;
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(0, 1);
#ifdef EIGEN_USE_SYCL // warmup for sycl
for (int iter = 0; iter < 10; ++iter) {
C.device(device_) = A.contract(B, dims);
}
#endif
auto start = get_time();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
auto end = get_time();
// Record the number of FLOPs executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
device_.synchronize();
}
int main() {
cl::sycl::gpu_selector selector;
Eigen::QueueInterface queue(selector);
Eigen::SyclDevice device(&queue);
int64_t num_iters =20;
for(int64_t m = 32; m <= 4096; m *= 2)
for(int64_t k = 32; k <= 4096; k *= 2)
for(int64_t n = 32; n <= 4096; n*= 2){
(contraction<float>(device, num_iters, m, k, n));
(contractionRowMajor<float>(device, num_iters, m, k, n));
(contractionAT<float>(device, num_iters, m, k, n));
(contractionBT<float>(device, num_iters, m, k, n));
(contractionABT<float>(device, num_iters, m, k, n));
}
return 0;
}
#endif // EIGEN_BENCH_CONTRACT_SYCL
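As a worked example of the GFLOP/s figure printed by finalizeBenchmark above: for m = k = n = 1024 and num_iters = 20, the benchmark performs 2 * 1024^3 * 20 ≈ 4.29e10 floating-point operations, so an elapsed time of 0.5 s is reported as roughly 85.9 GFLOP/s.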

View File

@ -113,111 +113,28 @@ macro(ei_add_test_internal testname testname_with_suffix)
add_dependencies("Build${current_subproject}" ${targetname}) add_dependencies("Build${current_subproject}" ${targetname})
set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}")
endif() endif()
if(EIGEN_SYCL)
endmacro() # Force include of the SYCL file at the end to avoid errors.
set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1)
# SYCL # Set COMPILE_FLAGS to COMPILE_DEFINITIONS instead to avoid having to duplicate the flags
macro(ei_add_test_internal_sycl testname testname_with_suffix) # to the device compiler.
set(targetname ${testname_with_suffix}) get_target_property(target_compile_flags ${targetname} COMPILE_FLAGS)
separate_arguments(target_compile_flags)
if(EIGEN_ADD_TEST_FILENAME_EXTENSION) foreach(flag ${target_compile_flags})
set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) if(${flag} MATCHES "^-D.*")
else() string(REPLACE "-D" "" definition_flag ${flag})
set(filename ${testname}.cpp) set_property(TARGET ${targetname} APPEND PROPERTY COMPILE_DEFINITIONS ${definition_flag})
endif() list(REMOVE_ITEM target_compile_flags ${flag})
endif()
set( include_file "${CMAKE_CURRENT_BINARY_DIR}/inc_${filename}") endforeach()
set( bc_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.sycl") set_property(TARGET ${targetname} PROPERTY COMPILE_FLAGS ${target_compile_flags})
set( host_file "${CMAKE_CURRENT_SOURCE_DIR}/${filename}") # Link against pthread and add sycl to target
set(THREADS_PREFER_PTHREAD_FLAG ON)
if(NOT EIGEN_SYCL_TRISYCL) find_package(Threads REQUIRED)
include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include) target_link_libraries(${targetname} Threads::Threads)
add_sycl_to_target(TARGET ${targetname} SOURCES ${filename})
add_custom_command( endif(EIGEN_SYCL)
OUTPUT ${include_file} endmacro(ei_add_test_internal)
COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file}
COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}\\\"" >> ${include_file}
DEPENDS ${filename} ${bc_file}
COMMENT "Building ComputeCpp integration header file ${include_file}"
)
# Add a custom target for the generated integration header
add_custom_target("${testname}_integration_header_sycl" DEPENDS ${include_file})
add_executable(${targetname} ${include_file})
add_dependencies(${targetname} "${testname}_integration_header_sycl")
else()
add_executable(${targetname} ${host_file})
endif()
add_sycl_to_target(${targetname} ${CMAKE_CURRENT_BINARY_DIR} ${filename})
if (targetname MATCHES "^eigen2_")
add_dependencies(eigen2_buildtests ${targetname})
else()
add_dependencies(buildtests ${targetname})
endif()
if(EIGEN_NO_ASSERTION_CHECKING)
ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1")
else()
if(EIGEN_DEBUG_ASSERTS)
ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1")
endif()
endif()
ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")
if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS)
ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
endif()
# let the user pass flags.
if(${ARGC} GREATER 2)
ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}")
endif()
if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
endif()
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
endif()
if(EXTERNAL_LIBS)
target_link_libraries(${targetname} ${EXTERNAL_LIBS})
endif()
if(EIGEN_TEST_CUSTOM_LINKER_FLAGS)
target_link_libraries(${targetname} ${EIGEN_TEST_CUSTOM_LINKER_FLAGS})
endif()
if(${ARGC} GREATER 3)
set(libs_to_link ${ARGV3})
# it could be that some cmake module provides a bad library string " " (just spaces),
# and that severely breaks target_link_libraries ("can't link to -l-lstdc++" errors).
# so we check for strings containing only spaces.
string(STRIP "${libs_to_link}" libs_to_link_stripped)
string(LENGTH "${libs_to_link_stripped}" libs_to_link_stripped_length)
if(${libs_to_link_stripped_length} GREATER 0)
# notice: no double quotes around ${libs_to_link} here. It may be a list.
target_link_libraries(${targetname} ${libs_to_link})
endif()
endif()
add_test(${testname_with_suffix} "${targetname}")
# Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT
get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
if ((current_subproject) AND (NOT (current_subproject STREQUAL "")))
set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}")
add_dependencies("Build${current_subproject}" ${targetname})
set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}")
endif()
endmacro()
# Macro to add a test # Macro to add a test
# #
# the unique mandatory parameter testname must correspond to a file # the unique mandatory parameter testname must correspond to a file
@ -296,40 +213,6 @@ macro(ei_add_test testname)
endif() endif()
endmacro() endmacro()
macro(ei_add_test_sycl testname)
get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n")
set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
else()
set(filename ${testname}.cpp)
endif()
file(READ "${filename}" test_source)
set(parts 0)
string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
occurrences "${test_source}")
string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}")
list(REMOVE_DUPLICATES suffixes)
if(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
add_custom_target(${testname})
foreach(suffix ${suffixes})
ei_add_test_internal_sycl(${testname} ${testname}_${suffix}
"${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}")
add_dependencies(${testname} ${testname}_${suffix})
endforeach()
else()
set(symbols_to_enable_all_parts "")
foreach(suffix ${suffixes})
set(symbols_to_enable_all_parts
"${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1")
endforeach()
ei_add_test_internal_sycl(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}")
endif()
endmacro()
# adds a failtest, i.e. a test that succeed if the program fails to compile # adds a failtest, i.e. a test that succeed if the program fails to compile
# note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON # note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON
# so here we're just running CMake commands immediately, we're not adding any targets. # so here we're just running CMake commands immediately, we're not adding any targets.

View File

@ -2,7 +2,7 @@
# FindComputeCpp # FindComputeCpp
#--------------- #---------------
# #
# Copyright 2016 Codeplay Software Ltd. # Copyright 2016-2018 Codeplay Software Ltd.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use these files except in compliance with the License. # you may not use these files except in compliance with the License.
@ -23,244 +23,421 @@
# #
# Tools for finding and building with ComputeCpp. # Tools for finding and building with ComputeCpp.
# #
# User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp # User must define ComputeCpp_DIR pointing to the ComputeCpp
# installation. # installation.
# #
# Latest version of this file can be found at: # Latest version of this file can be found at:
# https://github.com/codeplaysoftware/computecpp-sdk # https://github.com/codeplaysoftware/computecpp-sdk
# Require CMake version 3.2.2 or higher cmake_minimum_required(VERSION 3.4.3)
cmake_minimum_required(VERSION 3.2.2) include(FindPackageHandleStandardArgs)
# Check that a supported host compiler can be found
if(CMAKE_COMPILER_IS_GNUCXX)
# Require at least gcc 4.8
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
message(FATAL_ERROR
"host compiler - Not found! (gcc version must be at least 4.8)")
else()
message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# Require at least clang 3.6
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
message(FATAL_ERROR
"host compiler - Not found! (clang version must be at least 3.6)")
else()
message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
endif()
else()
message(WARNING
"host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
endif()
set(COMPUTECPP_64_BIT_DEFAULT ON)
option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode"
${COMPUTECPP_64_BIT_DEFAULT})
mark_as_advanced(COMPUTECPP_64_BIT_CODE)
option(COMPUTECPP_DISABLE_GCC_DUAL_ABI "Compile with pre-5.1 ABI" OFF)
mark_as_advanced(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++") set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++")
separate_arguments(COMPUTECPP_USER_FLAGS)
mark_as_advanced(COMPUTECPP_USER_FLAGS) mark_as_advanced(COMPUTECPP_USER_FLAGS)
# Find OpenCL package set(COMPUTECPP_BITCODE "spir64" CACHE STRING
"Bitcode type to use as SYCL target in compute++")
mark_as_advanced(COMPUTECPP_BITCODE)
find_package(OpenCL REQUIRED) find_package(OpenCL REQUIRED)
# Find ComputeCpp packagee # Find ComputeCpp package
if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
message(FATAL_ERROR if(DEFINED ComputeCpp_DIR)
"ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR") set(computecpp_find_hint ${ComputeCpp_DIR})
else() elseif(DEFINED ENV{COMPUTECPP_DIR})
message(STATUS "ComputeCpp package - Found") set(computecpp_find_hint $ENV{COMPUTECPP_DIR})
endif() endif()
# Obtain the path to compute++ # Used for running executables on the host
find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS set(computecpp_host_find_hint ${computecpp_find_hint})
${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
message(STATUS "compute++ - Found")
else()
message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
endif()
# Obtain the path to computecpp_info if(CMAKE_CROSSCOMPILING)
find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS # ComputeCpp_HOST_DIR is used to find executables that are run on the host
${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) if(DEFINED ComputeCpp_HOST_DIR)
if (EXISTS ${COMPUTECPP_INFO_TOOL}) set(computecpp_host_find_hint ${ComputeCpp_HOST_DIR})
mark_as_advanced(${COMPUTECPP_INFO_TOOL}) elseif(DEFINED ENV{COMPUTECPP_HOST_DIR})
message(STATUS "computecpp_info - Found") set(computecpp_host_find_hint $ENV{COMPUTECPP_HOST_DIR})
else()
message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
endif()
# Obtain the path to the ComputeCpp runtime library
find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib
DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH)
if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
message(STATUS "libComputeCpp.so - Found")
else()
message(FATAL_ERROR "libComputeCpp.so - Not found!")
endif()
# Obtain the ComputeCpp include directory
set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
message(FATAL_ERROR "ComputeCpp includes - Not found!")
else()
message(STATUS "ComputeCpp includes - Found")
endif()
# Obtain the package version
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
message(FATAL_ERROR "Package version - Error obtaining version!")
else()
mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
endif()
# Obtain the device compiler flags
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
else()
mark_as_advanced(COMPUTECPP_COMPILER_FLAGS)
message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
endif()
# Check if the platform is supported
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
message(FATAL_ERROR "platform - Error checking platform support!")
else()
mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
message(STATUS "platform - your system can support ComputeCpp")
else()
message(STATUS "platform - your system CANNOT support ComputeCpp")
endif() endif()
endif() endif()
set(COMPUTECPP_USER_FLAGS find_program(ComputeCpp_DEVICE_COMPILER_EXECUTABLE compute++
-sycl-compress-name HINTS ${computecpp_host_find_hint}
-Wall PATH_SUFFIXES bin)
-no-serial-memop
-DEIGEN_NO_ASSERTION_CHECKING=1 find_program(ComputeCpp_INFO_EXECUTABLE computecpp_info
HINTS ${computecpp_host_find_hint}
PATH_SUFFIXES bin)
find_library(COMPUTECPP_RUNTIME_LIBRARY
NAMES ComputeCpp ComputeCpp_vs2015
HINTS ${computecpp_find_hint}
PATH_SUFFIXES lib
DOC "ComputeCpp Runtime Library")
find_library(COMPUTECPP_RUNTIME_LIBRARY_DEBUG
NAMES ComputeCpp ComputeCpp_vs2015_d
HINTS ${computecpp_find_hint}
PATH_SUFFIXES lib
DOC "ComputeCpp Debug Runtime Library")
find_path(ComputeCpp_INCLUDE_DIRS
NAMES "CL/sycl.hpp"
HINTS ${computecpp_find_hint}/include
DOC "The ComputeCpp include directory")
get_filename_component(ComputeCpp_INCLUDE_DIRS ${ComputeCpp_INCLUDE_DIRS} ABSOLUTE)
get_filename_component(computecpp_canonical_root_dir "${ComputeCpp_INCLUDE_DIRS}/.." ABSOLUTE)
set(ComputeCpp_ROOT_DIR "${computecpp_canonical_root_dir}" CACHE PATH
"The root of the ComputeCpp install")
if(NOT ComputeCpp_INFO_EXECUTABLE)
message(WARNING "Can't find computecpp_info - check ComputeCpp_DIR")
else()
execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-version"
OUTPUT_VARIABLE ComputeCpp_VERSION
RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0")
message(WARNING "Package version - Error obtaining version!")
endif()
execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-is-supported"
OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0")
message(WARNING "platform - Error checking platform support!")
else()
mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
message(STATUS "platform - your system can support ComputeCpp")
else()
message(WARNING "platform - your system CANNOT support ComputeCpp")
endif()
endif()
endif()
find_package_handle_standard_args(ComputeCpp
REQUIRED_VARS ComputeCpp_ROOT_DIR
ComputeCpp_DEVICE_COMPILER_EXECUTABLE
ComputeCpp_INFO_EXECUTABLE
COMPUTECPP_RUNTIME_LIBRARY
COMPUTECPP_RUNTIME_LIBRARY_DEBUG
ComputeCpp_INCLUDE_DIRS
VERSION_VAR ComputeCpp_VERSION)
mark_as_advanced(ComputeCpp_ROOT_DIR
ComputeCpp_DEVICE_COMPILER_EXECUTABLE
ComputeCpp_INFO_EXECUTABLE
COMPUTECPP_RUNTIME_LIBRARY
COMPUTECPP_RUNTIME_LIBRARY_DEBUG
ComputeCpp_INCLUDE_DIRS
ComputeCpp_VERSION)
if(NOT ComputeCpp_FOUND)
return()
endif()
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -O2 -mllvm -inline-threshold=1000 -intelspirmetadata)
mark_as_advanced(COMPUTECPP_DEVICE_COMPILER_FLAGS)
if(CMAKE_CROSSCOMPILING)
if(NOT COMPUTECPP_DONT_USE_TOOLCHAIN)
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --gcc-toolchain=${COMPUTECPP_TOOLCHAIN_DIR})
endif()
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --sysroot=${COMPUTECPP_SYSROOT_DIR})
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -target ${COMPUTECPP_TARGET_TRIPLE})
endif()
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -sycl-target ${COMPUTECPP_BITCODE})
message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
if(NOT TARGET OpenCL::OpenCL)
add_library(OpenCL::OpenCL UNKNOWN IMPORTED)
set_target_properties(OpenCL::OpenCL PROPERTIES
IMPORTED_LOCATION "${OpenCL_LIBRARIES}"
INTERFACE_INCLUDE_DIRECTORIES "${OpenCL_INCLUDE_DIRS}"
) )
endif()
if(NOT TARGET ComputeCpp::ComputeCpp)
add_library(ComputeCpp::ComputeCpp UNKNOWN IMPORTED)
set_target_properties(ComputeCpp::ComputeCpp PROPERTIES
IMPORTED_LOCATION_DEBUG "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}"
IMPORTED_LOCATION_RELWITHDEBINFO "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}"
IMPORTED_LOCATION "${COMPUTECPP_RUNTIME_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${ComputeCpp_INCLUDE_DIRS}"
INTERFACE_LINK_LIBRARIES "OpenCL::OpenCL"
)
endif()
# This property allows targets to specify that their sources should be
# compiled with the integration header included after the user's
# sources, not before (e.g. when an enum is used in a kernel name, this
# is not technically valid SYCL code but can work with ComputeCpp)
define_property(
TARGET PROPERTY COMPUTECPP_INCLUDE_AFTER
BRIEF_DOCS "Include integration header after user source"
FULL_DOCS "Changes compiler arguments such that the source file is
actually the integration header, and the .cpp file is included on
the command line so that it is seen by the compiler first. Enables
non-standards-conformant SYCL code to compile with ComputeCpp."
)
define_property(
TARGET PROPERTY INTERFACE_COMPUTECPP_FLAGS
BRIEF_DOCS "Interface compile flags to provide compute++"
FULL_DOCS "Set additional compile flags to pass to compute++ when compiling
any target which links to this one."
)
define_property(
SOURCE PROPERTY COMPUTECPP_SOURCE_FLAGS
BRIEF_DOCS "Source file compile flags for compute++"
FULL_DOCS "Set additional compile flags for compiling the SYCL integration
header for the given source file."
)
#################### ####################
# __build_sycl # __build_ir
#################### ####################
# #
# Adds a custom target for running compute++ and adding a dependency for the # Adds a custom target for running compute++ and adding a dependency for the
# resulting integration header. # resulting integration header.
# #
# targetName : Name of the target. # TARGET : Name of the target.
# sourceFile : Source file to be compiled. # SOURCE : Source file to be compiled.
# binaryDir : Intermediate directory to output the integration header. # COUNTER : Counter included in name of custom target. Different counter
# fileCounter : Counter included in name of custom target. Different counter # values prevent duplicated names of custom target when source files with
# values prevent duplicated names of custom target when source files with the same name, # the same name, but located in different directories, are used for the
# but located in different directories, are used for the same target. # same target.
# #
function(__build_spir targetName sourceFile binaryDir fileCounter) function(__build_ir)
set(options)
set(one_value_args
TARGET
SOURCE
COUNTER
)
set(multi_value_args)
cmake_parse_arguments(SDK_BUILD_IR
"${options}"
"${one_value_args}"
"${multi_value_args}"
${ARGN}
)
get_filename_component(sourceFileName ${SDK_BUILD_IR_SOURCE} NAME)
# Retrieve source file name. # Set the path to the integration header.
get_filename_component(sourceFileName ${sourceFile} NAME) # The .sycl filename must depend on the target so that different targets
# using the same source file will be generated with a different rule.
set(baseSyclName ${CMAKE_CURRENT_BINARY_DIR}/${SDK_BUILD_IR_TARGET}_${sourceFileName})
set(outputSyclFile ${baseSyclName}.sycl)
set(depFileName ${baseSyclName}.sycl.d)
# Set the path to the Sycl file. set(include_directories "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},INCLUDE_DIRECTORIES>")
set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl) set(compile_definitions "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},COMPILE_DEFINITIONS>")
set(generated_include_directories
$<$<BOOL:${include_directories}>:-I\"$<JOIN:${include_directories},\"\t-I\">\">)
set(generated_compile_definitions
$<$<BOOL:${compile_definitions}>:-D$<JOIN:${compile_definitions},\t-D>>)
# Obtain language standard of the file
set(device_compiler_cxx_standard)
get_target_property(targetCxxStandard ${SDK_BUILD_IR_TARGET} CXX_STANDARD)
if (targetCxxStandard MATCHES 17)
set(device_compiler_cxx_standard "-std=c++1z")
elseif (targetCxxStandard MATCHES 14)
set(device_compiler_cxx_standard "-std=c++14")
elseif (targetCxxStandard MATCHES 11)
set(device_compiler_cxx_standard "-std=c++11")
elseif (targetCxxStandard MATCHES 98)
message(FATAL_ERROR "SYCL applications cannot be compiled using C++98")
else ()
set(device_compiler_cxx_standard "")
endif()
get_property(source_compile_flags
SOURCE ${SDK_BUILD_IR_SOURCE}
PROPERTY COMPUTECPP_SOURCE_FLAGS
)
separate_arguments(source_compile_flags)
if(source_compile_flags)
list(APPEND computecpp_source_flags ${source_compile_flags})
endif()
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS
${device_compiler_cxx_standard}
${COMPUTECPP_USER_FLAGS}
${computecpp_source_flags}
)
set(ir_dependencies ${SDK_BUILD_IR_SOURCE})
get_target_property(target_libraries ${SDK_BUILD_IR_TARGET} LINK_LIBRARIES)
if(target_libraries)
foreach(library ${target_libraries})
list(APPEND ir_dependencies ${library})
endforeach()
endif()
# Depfile support was only added in CMake 3.7
# CMake throws an error if it is unsupported by the generator (i.e. not Ninja)
if((NOT CMAKE_VERSION VERSION_LESS 3.7.0) AND
CMAKE_GENERATOR MATCHES "Ninja")
file(RELATIVE_PATH relOutputFile ${CMAKE_BINARY_DIR} ${outputSyclFile})
set(generate_depfile -MMD -MF ${depFileName} -MT ${relOutputFile})
set(enable_depfile DEPFILE ${depFileName})
endif()
# Add custom command for running compute++
add_custom_command(
OUTPUT ${outputSyclFile}
COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
${COMPUTECPP_DEVICE_COMPILER_FLAGS}
${generated_include_directories}
${generated_compile_definitions}
-o ${outputSyclFile}
-c ${SDK_BUILD_IR_SOURCE}
${generate_depfile}
DEPENDS ${ir_dependencies}
IMPLICIT_DEPENDS CXX ${SDK_BUILD_IR_SOURCE}
${enable_depfile}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
# Name: (user-defined name)_(source file)_(counter)_ih
set(headerTargetName
${SDK_BUILD_IR_TARGET}_${sourceFileName}_${SDK_BUILD_IR_COUNTER}_ih)
if(NOT MSVC)
# Add a custom target for the generated integration header
add_custom_target(${headerTargetName} DEPENDS ${outputSyclFile})
add_dependencies(${SDK_BUILD_IR_TARGET} ${headerTargetName})
endif()
# This property can be set on a per-target basis to indicate that the
# integration header should appear after the main source listing
get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER)
if(includeAfter)
# Change the source file to the integration header - e.g.
# g++ -c source_file_name.cpp.sycl
get_target_property(current_sources ${SDK_BUILD_IR_TARGET} SOURCES)
# Remove absolute path to source file
list(REMOVE_ITEM current_sources ${SDK_BUILD_IR_SOURCE})
# Remove relative path to source file
string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" ""
rel_source_file ${SDK_BUILD_IR_SOURCE}
)
list(REMOVE_ITEM current_sources ${rel_source_file})
# Add SYCL header to source list
list(APPEND current_sources ${outputSyclFile})
set_property(TARGET ${SDK_BUILD_IR_TARGET}
PROPERTY SOURCES ${current_sources})
# CMake/gcc don't know what language a .sycl file is, so tell them
set_property(SOURCE ${outputSyclFile} PROPERTY LANGUAGE CXX)
set(includedFile ${SDK_BUILD_IR_SOURCE})
set(cppFile ${outputSyclFile})
else()
set_property(SOURCE ${outputSyclFile} PROPERTY HEADER_FILE_ONLY ON)
set(includedFile ${outputSyclFile})
set(cppFile ${SDK_BUILD_IR_SOURCE})
endif()
# Force inclusion of the integration header for the host compiler
if(MSVC)
# Group SYCL files inside Visual Studio
source_group("SYCL" FILES ${outputSyclFile})
if(includeAfter)
# Allow the source file to be edited using Visual Studio.
# It will be added as a header file so it won't be compiled.
set_property(SOURCE ${SDK_BUILD_IR_SOURCE} PROPERTY HEADER_FILE_ONLY true)
endif()
# Add both source and the sycl files to the VS solution.
target_sources(${SDK_BUILD_IR_TARGET} PUBLIC ${SDK_BUILD_IR_SOURCE} ${outputSyclFile})
set(forceIncludeFlags "/FI${includedFile} /TP")
else()
set(forceIncludeFlags "-include ${includedFile} -x c++")
endif()
set_property(
SOURCE ${cppFile}
APPEND_STRING PROPERTY COMPILE_FLAGS "${forceIncludeFlags}"
)
endfunction(__build_ir)
#######################
#  add_sycl_to_target
#######################
#
#  Adds a SYCL compilation custom command associated with an existing
#  target and sets a dependency on that new command.
#
#  TARGET : Name of the target to add SYCL to.
#  SOURCES : Source files to be compiled for SYCL.
#
function(add_sycl_to_target)
set(options)
set(one_value_args
TARGET
)
set(multi_value_args
SOURCES
)
cmake_parse_arguments(SDK_ADD_SYCL
"${options}"
"${one_value_args}"
"${multi_value_args}"
${ARGN}
)
# If the CXX compiler is set to compute++ enable the driver.
get_filename_component(cmakeCxxCompilerFileName "${CMAKE_CXX_COMPILER}" NAME)
if("${cmakeCxxCompilerFileName}" STREQUAL "compute++")
if(MSVC)
message(FATAL_ERROR "The compiler driver is not supported by this system,
revert the CXX compiler to your default host compiler.")
endif()
get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER)
if(includeAfter)
list(APPEND COMPUTECPP_USER_FLAGS -fsycl-ih-last)
endif()
list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl-driver)
# Prepend COMPUTECPP_DEVICE_COMPILER_FLAGS and append COMPUTECPP_USER_FLAGS
foreach(prop COMPILE_OPTIONS INTERFACE_COMPILE_OPTIONS)
get_target_property(target_compile_options ${SDK_ADD_SYCL_TARGET} ${prop})
if(NOT target_compile_options)
set(target_compile_options "")
endif()
set_property(
TARGET ${SDK_ADD_SYCL_TARGET}
PROPERTY ${prop}
${COMPUTECPP_DEVICE_COMPILER_FLAGS}
${target_compile_options}
${COMPUTECPP_USER_FLAGS}
)
endforeach()
else()
set(fileCounter 0)
list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl)
# Add custom target to run compute++ and generate the integration header
foreach(sourceFile ${SDK_ADD_SYCL_SOURCES})
if(NOT IS_ABSOLUTE ${sourceFile})
set(sourceFile "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}")
endif()
__build_ir(
TARGET ${SDK_ADD_SYCL_TARGET}
SOURCE ${sourceFile}
COUNTER ${fileCounter}
)
MATH(EXPR fileCounter "${fileCounter} + 1")
endforeach()
endif()
set_property(TARGET ${SDK_ADD_SYCL_TARGET}
APPEND PROPERTY LINK_LIBRARIES ComputeCpp::ComputeCpp)
set_property(TARGET ${SDK_ADD_SYCL_TARGET}
APPEND PROPERTY INTERFACE_LINK_LIBRARIES ComputeCpp::ComputeCpp)
endfunction(add_sycl_to_target)
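For reference, a minimal sketch of the kind of translation unit that gets passed to add_sycl_to_target and compiled by compute++: it defines EIGEN_USE_SYCL, builds a SyclDevice and evaluates a tensor expression on it. The file name and variable names are illustrative only; the QueueInterface/SyclDevice calls follow the API exercised by the SYCL tensor tests, so treat this as a hedged example rather than a canonical one.

// sycl_example.cpp (hypothetical source listed in SOURCES)
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Create a SYCL queue and wrap it in an Eigen device.
  Eigen::QueueInterface queue(cl::sycl::default_selector());
  Eigen::SyclDevice device(&queue);

  Eigen::Tensor<float, 1> in1(100), in2(100), out(100);
  in1.setRandom();
  in2.setRandom();

  // Device buffers managed through the SyclDevice allocator.
  float* d_in1 = static_cast<float*>(device.allocate(in1.size() * sizeof(float)));
  float* d_in2 = static_cast<float*>(device.allocate(in2.size() * sizeof(float)));
  float* d_out = static_cast<float*>(device.allocate(out.size() * sizeof(float)));
  device.memcpyHostToDevice(d_in1, in1.data(), in1.size() * sizeof(float));
  device.memcpyHostToDevice(d_in2, in2.data(), in2.size() * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in1(d_in1, 100), gpu_in2(d_in2, 100), gpu_out(d_out, 100);
  gpu_out.device(device) = gpu_in1 + gpu_in2;  // expression compiled for the SYCL device

  device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));
  device.synchronize();
  device.deallocate(d_in1);
  device.deallocate(d_in2);
  device.deallocate(d_out);
  return 0;
}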


@ -15,19 +15,6 @@
#if EIGEN_HAS_CXX11
#if defined(EIGEN_USE_SYCL)
#undef min
#undef max
#undef isnan
#undef isinf
#undef isfinite
#include <CL/sycl.hpp>
#include <iostream>
#include <map>
#include <memory>
#include <utility>
#endif
#include "../SpecialFunctions" #include "../SpecialFunctions"
#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" #include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
@ -72,7 +59,7 @@ typedef unsigned __int64 uint64_t;
#include <time.h>
#endif
#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
#include "ThreadPool"
#endif
@ -147,7 +134,13 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorScan.h" #include "src/Tensor/TensorScan.h"
#include "src/Tensor/TensorTrace.h" #include "src/Tensor/TensorTrace.h"
#include "src/Tensor/TensorSycl.h" #ifdef EIGEN_USE_SYCL
#include "src/Tensor/TensorReductionSycl.h"
#include "src/Tensor/TensorConvolutionSycl.h"
#include "src/Tensor/TensorContractionSycl.h"
#include "src/Tensor/TensorScanSycl.h"
#endif
#include "src/Tensor/TensorExecutor.h" #include "src/Tensor/TensorExecutor.h"
#include "src/Tensor/TensorDevice.h" #include "src/Tensor/TensorDevice.h"


@ -1,152 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorArgMaxSycl.h
* \brief:
* TensorArgMaxSycl
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
namespace Eigen {
namespace internal {
template<typename Dims, typename XprType>
struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense>
{
typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type;
};
template<typename Dims, typename XprType>
struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1,
typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type>
{
typedef TensorTupleReducerDeviceOp<Dims, XprType> type;
};
template<typename StrideDims, typename XprType>
struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType>
{
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef Index Scalar;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
}// end namespace internal
template<typename StrideDims, typename XprType>
class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested;
typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index;
typedef typename XprType::CoeffReturnType TupleType;
typedef Index CoeffReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr,
const Index return_dim,
const StrideDims strides,
const Index stride_mod, const Index stride_div)
:m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
Index return_dim() const { return m_return_dim; }
EIGEN_DEVICE_FUNC
const StrideDims& strides() const { return m_strides; }
EIGEN_DEVICE_FUNC
const Index& stride_mod() const { return m_stride_mod; }
EIGEN_DEVICE_FUNC
const Index& stride_div() const { return m_stride_div; }
protected:
typename Eigen::internal::remove_all<typename
XprType::Nested
>::type m_xpr;
const Index m_return_dim;
const StrideDims m_strides;
const Index m_stride_mod;
const Index m_stride_div;
};
// Eval as rvalue
template<typename StrideDims, typename ArgType>
struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>
{
typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType;
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::TupleType TupleType;
typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions;
enum {
IsAligned = false,
PacketAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
: m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
m_stride_div(op.stride_div()){}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
return m_impl.dimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
const TupleType v = m_impl.coeff(index);
return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
}
typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast<ptr_Dev_type>(m_impl.data()); }
protected:
TensorEvaluator<ArgType , SyclKernelDevice> m_impl;
const Index m_return_dim;
const StrideDims m_strides;
const Index m_stride_mod;
const Index m_stride_div;
};
} // end namespace Eigen
#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
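The coeff() member of the removed evaluator above recovers the coordinate along the requested dimension from the flat index carried in the tuple via (v.first % m_stride_mod) / m_stride_div. A small standalone check of that arithmetic, using hypothetical sizes (col-major 4x5x6 tensor, return_dim = 1):

#include <cassert>
#include <cstddef>

int main() {
  const ptrdiff_t dims[3] = {4, 5, 6};
  // Col-major strides are {1, 4, 20}; for return_dim = 1:
  const ptrdiff_t stride_div = dims[0];               // stride of dim 1 -> 4
  const ptrdiff_t stride_mod = stride_div * dims[1];  // stride of dim 2 -> 20
  // Flat index of element (i0, i1, i2) = (3, 2, 5):
  const ptrdiff_t flat = 3 + 2 * 4 + 5 * 20;          // 111
  // (flat % stride_mod) / stride_div recovers i1 = 2:
  assert((flat % stride_mod) / stride_div == 2);
  return 0;
}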


@ -545,6 +545,10 @@ class TensorContractionInputMapper
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(*this, i, j);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
return Base::m_tensor;
}
};

unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h (1882 changes, Normal file → Executable file): file diff suppressed because it is too large.
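Although the TensorContractionSycl.h diff is suppressed here, the user-level operation it implements is an ordinary tensor contraction evaluated on a SyclDevice. A minimal, hypothetical illustration (buffer management elided; the function name and TensorMap parameters are placeholders):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void contract_on_device(Eigen::SyclDevice& device,
                        const Eigen::TensorMap<Eigen::Tensor<float, 2>>& lhs,  // (m, k), device memory
                        const Eigen::TensorMap<Eigen::Tensor<float, 2>>& rhs,  // (k, n), device memory
                        Eigen::TensorMap<Eigen::Tensor<float, 2>>& out) {      // (m, n), device memory
  // Contract the second dimension of lhs with the first dimension of rhs (matrix product).
  Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dims = {{Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
  out.device(device) = lhs.contract(rhs, dims);
}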


@ -18,207 +18,252 @@
namespace Eigen {

/** \class TensorConvolution
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor convolution class.
 *
 *
 */
enum class convolution_type { CONV1D, CONV2D, CONV3D };
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
Buffer_accessor, convolution_type::CONV1D> {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Local_accessor;
Local_accessor local_acc;
Evaluator device_evaluator;
Kernel_accessor kernel_filter;
Buffer_accessor buffer_acc;
internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
const size_t kernelSize;
const cl::sycl::range<2> input_range;
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
Buffer_accessor buffer_acc_,
internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
const size_t kernelSize_, const cl::sycl::range<2> input_range_)
: local_acc(local_acc_),
device_evaluator(device_evaluator_),
kernel_filter(kernel_filter_),
buffer_acc(buffer_acc_),
indexMapper(indexMapper_),
kernelSize(kernelSize_),
input_range(input_range_) {}

template <typename BooleanDim2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
return (boolean_check[0] && boolean_check[1]);
}
void operator()(cl::sycl::nd_item<2> itemID) {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
// the number of input rows required for each plane in shared memory
const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
/// fill the shared memory
for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
const size_t local_index = i + plane_kernel_offset;
const size_t tensor_index =
plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

local_acc[local_index] =
(((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
? device_evaluator.coeff(tensor_index)
: CoeffReturnType(0);
}

itemID.barrier(cl::sycl::access::fence_space::local_space);

// calculate the convolution // output start x
const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);
const size_t index = plane_kernel_offset + itemID.get_local_id(0);
for (size_t k = 0; k < kernelSize; ++k) {
result += (local_acc[k + index] * kernel_ptr[k]);
}
const size_t tensor_index =
indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
buffer_ptr[tensor_index] = result;
}
}
};
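The kernel above stages one tile per work-group in local memory: each group of size local loads local + kernelSize - 1 inputs (zero-padding out-of-range loads) and every output element then reads kernelSize consecutive staged entries. A plain C++ sketch of the same 1D tiling idea, not the SYCL kernel itself, with illustrative sizes:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const size_t N = 10, kernelSize = 3, local = 4;
  std::vector<float> in(N, 1.0f), kernel(kernelSize, 0.5f);
  const size_t numOut = N - kernelSize + 1;
  std::vector<float> out(numOut, 0.0f);

  for (size_t group = 0; group * local < numOut; ++group) {
    const size_t input_offset = group * local;
    // "local memory": local + kernelSize - 1 entries, zero-padded like the kernel above
    std::vector<float> scratch(local + kernelSize - 1, 0.0f);
    for (size_t i = 0; i < scratch.size(); ++i)
      if (input_offset + i < N) scratch[i] = in[input_offset + i];

    for (size_t lid = 0; lid < local && input_offset + lid < numOut; ++lid) {
      float result = 0.0f;
      for (size_t k = 0; k < kernelSize; ++k) result += scratch[lid + k] * kernel[k];
      out[input_offset + lid] = result;
    }
  }
  std::printf("out[0] = %f\n", out[0]);  // 1.5 for all-ones input and a 0.5 kernel
  return 0;
}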
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
Buffer_accessor, convolution_type::CONV2D> {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Local_accessor;
Local_accessor local_acc;
Evaluator device_evaluator;
Kernel_accessor kernel_filter;
Buffer_accessor buffer_acc;
internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
const cl::sycl::range<2> kernel_size;
const cl::sycl::range<3> input_range;
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
Buffer_accessor buffer_acc_,
internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
: local_acc(local_acc_),
device_evaluator(device_evaluator_),
kernel_filter(kernel_filter_),
buffer_acc(buffer_acc_),
indexMapper(indexMapper_),
kernel_size(kernel_size_),
input_range(input_range_) {}
template <typename BooleanDim3>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}

void operator()(cl::sycl::nd_item<3> itemID) {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
// the number of input rows required for each plane in shared memory
const auto num_input = cl::sycl::range<2>{
(cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
itemID.get_group(1) * itemID.get_local_range()[1]};

// fill the local memory
bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
const size_t local_index = i + local_input_offset;
const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
i + input_offset[0], j + input_offset[1]);
local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
in_range_dim1 && in_range_dim2)
? device_evaluator.coeff(tensor_index)
: CoeffReturnType(0);
}
}

itemID.barrier(cl::sycl::access::fence_space::local_space);

// output offset start for each thread
const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
itemID.get_group(1) * itemID.get_local_range()[1]};

if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);

for (size_t j = 0; j < kernel_size[1]; j++) {
size_t kernel_offset = kernel_size[0] * j;
const size_t index =
(num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
for (size_t i = 0; i < kernel_size[0]; i++) {
result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
}
}
const size_t tensor_index =
indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
itemID.get_local_id(1) + output_offset[1]);

buffer_ptr[tensor_index] = result;
}
}
};
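The 2D tile above needs (local_x + kx - 1) * (local_y + ky - 1) scalars of local memory per plane handled by the group, which is exactly what the launcher later asserts against sharedMemPerBlock(). A quick arithmetic check with illustrative numbers (a 16x16 work-group and an 8x8 kernel):

#include <cstddef>
#include <cstdio>

int main() {
  const size_t local_x = 16, local_y = 16, kx = 8, ky = 8;
  // (16 + 8 - 1) * (16 + 8 - 1) = 23 * 23 = 529 staged scalars per plane
  const size_t tile = (local_x + kx - 1) * (local_y + ky - 1);
  std::printf("local memory elements per plane: %zu\n", tile);  // 529
  return 0;
}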
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
Buffer_accessor, convolution_type::CONV3D> {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Local_accessor;
Local_accessor local_acc;
Evaluator device_evaluator;
Kernel_accessor kernel_filter;
Buffer_accessor buffer_acc;
internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
const cl::sycl::range<3> kernel_size;
const cl::sycl::range<3> input_range;
const size_t numP;

EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
Buffer_accessor buffer_acc_,
internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
const size_t numP_)
: local_acc(local_acc_),
device_evaluator(device_evaluator_),
kernel_filter(kernel_filter_),
buffer_acc(buffer_acc_),
indexMapper(indexMapper_),
kernel_size(kernel_size_),
input_range(input_range_),
numP(numP_) {}
template <typename BooleanDim3>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

const auto output_offset =
cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

for (size_t p = 0; p < numP; p++) {
/// fill the shared memory
const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
size_t local_index_dim2 = num_input[0] * num_input[1] * k;
bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
const size_t local_index = local_index_dim1 + i;
const size_t tensor_index =
plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
i + input_offset[0], j + input_offset[1], k + input_offset[2]);
local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
}
}
}
itemID.barrier(cl::sycl::access::fence_space::local_space);

// calculate the convolution
if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);
for (size_t k = 0; k < kernel_size[2]; k++) {
for (size_t j = 0; j < kernel_size[1]; j++) {
for (size_t i = 0; i < kernel_size[0]; i++) {
const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
const size_t local_index =
((i + itemID.get_local_id(0)) +
num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
result += (local_acc[local_index] * kernel_ptr[kernel_index]);
}
}
}
const size_t tensor_index =
indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
buffer_ptr[tensor_index] = result;
}

itemID.barrier(cl::sycl::access::fence_space::local_space);
@ -226,25 +271,32 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
}
};
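The 3D kernel loops over numP independent planes, where numP is the total number of elements divided by the product of the three convolved dimensions (see the case 3 setup later in this diff). Illustrative arithmetic for that plane count, with hypothetical dimensions:

#include <cstddef>
#include <cstdio>

int main() {
  // Convolving the first three dimensions of a 32x32x32x8 tensor leaves 8 planes.
  const size_t dims[4] = {32, 32, 32, 8};
  const size_t total = dims[0] * dims[1] * dims[2] * dims[3];
  const size_t numP = total / (dims[0] * dims[1] * dims[2]);
  std::printf("numP = %zu\n", numP);  // 8, one iteration of the p loop per plane
  return 0;
}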
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

static const int NumDims =
internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
static const int NumKernelDims = internal::array_size<Indices>::value;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
typedef const Eigen::SyclDevice Device;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
typedef typename InputArgType::Scalar Scalar;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
typedef typename Storage::Type EvaluatorPointerType;
typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
enum {
IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
CoordAccess = false,  // to be implemented
RawAccess = false
};
@ -253,13 +305,22 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
: m_inputImpl(op.inputExpression(), device),
m_kernelArg(op.kernelExpression()),
m_kernelImpl(op.kernelExpression(), device),
m_indices(op.indices()),
m_buf(NULL),
m_kernel(NULL),
m_local_kernel(false),
m_device(device) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);

const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
m_kernelImpl.dimensions();

m_dimensions = m_inputImpl.dimensions();
for (int i = 0; i < NumKernelDims; ++i) {
@ -271,21 +332,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
preloadKernel();
m_inputImpl.evalSubExprsIfNeeded(NULL);
if (data) {
executeEval(data);
return false;
} else {
m_buf = (EvaluatorPointerType)m_device.get(
(Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
executeEval(m_buf);
return true;
}
@ -294,194 +351,194 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
if (m_buf) {
m_device.deallocate_temp(m_buf);
m_buf = NULL;
}
if (m_local_kernel) {
m_device.deallocate_temp(m_kernel);
m_local_kernel = false;
}
m_kernel = NULL;
}
/// used by sycl in order to build the sycl buffer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
/// used by sycl in order to build the sycl buffer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
// Don't make a local copy of the kernel unless we have to (i.e. it's an
// expression that needs to be evaluated)
typename KernelStorage::Type in_place = m_kernelImpl.data();
if (in_place) {
m_kernel = in_place;
m_local_kernel = false;
} else {
ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(m_device.get(local), m_kernelArg);
const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
m_kernel = local;
m_local_kernel = true;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
typedef typename InputEvaluator::Dimensions InputDims;
switch (NumKernelDims) {
case 1: {
const size_t numX = dimensions()[m_indices[0]];
const size_t numP = dimensions().TotalSize() / numX;
const auto input_dim = std::array<size_t, 2>{numX, numP};
auto global_range = cl::sycl::range<2>{};
auto local_range = cl::sycl::range<2>{};
const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
m_device.parallel_for_setup(input_dim, global_range, local_range);
const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
const array<Index, 1> indices{{m_indices[0]}};
const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
break;
}

case 2: {
auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
(size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
const size_t numX = dimensions()[m_indices[kernel_index[0]]];
const size_t numY = dimensions()[m_indices[kernel_index[1]]];
const size_t numP = dimensions().TotalSize() / (numX * numY);
auto input_dim = std::array<size_t, 3>{numX, numY, numP};
auto global_range = cl::sycl::range<3>{};
auto local_range = cl::sycl::range<3>{};
m_device.parallel_for_setup(input_dim, global_range, local_range);
const size_t local_memory_size =
(local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
const array<Index, 2> kernel_dims{
{m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
break;
}
case 3: {
auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
(size_t)m_kernelImpl.dimensions()[kernel_index[1]],
(size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
const size_t numX = dimensions()[m_indices[kernel_index[0]]];
const size_t numY = dimensions()[m_indices[kernel_index[1]]];
const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
const array<Index, 3> indices{
{m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
m_kernelImpl.dimensions()[kernel_index[1]],
m_kernelImpl.dimensions()[kernel_index[2]]}};
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
auto global_range = cl::sycl::range<3>{};
auto local_range = cl::sycl::range<3>{};
m_device.parallel_for_setup(input_dim, global_range, local_range);
auto local_memory_range = (local_range + kernel_size - 1);
const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
break;
}
default: {
EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
}
}
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
{ eigen_assert(m_buf != NULL);
eigen_assert(m_buf);
eigen_assert(index < m_dimensions.TotalSize()); eigen_assert(index < m_dimensions.TotalSize());
return m_buf[index]; return m_buf[index];
} }
template<int LoadMode> template <int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
{ eigen_assert(m_buf != NULL);
eigen_assert(m_buf);
eigen_assert(index < m_dimensions.TotalSize()); eigen_assert(index < m_dimensions.TotalSize());
return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
costPerCoeff(bool vectorized) const {
// TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
// model. // model.
const double kernel_size = m_kernelImpl.dimensions().TotalSize(); const double kernel_size = m_kernelImpl.dimensions().TotalSize();
// We ignore the use of fused multiply-add. // We ignore the use of fused multiply-add.
const double convolve_compute_cost = const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
const double firstIndex_compute_cost = const double firstIndex_compute_cost =
NumDims * NumDims *
(2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
TensorOpCost::DivCost<Index>());
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
kernel_size * (m_inputImpl.costPerCoeff(vectorized) + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
m_kernelImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
TensorOpCost(0, 0, convolve_compute_cost, vectorized, }
PacketSize)); // binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_kernelImpl.bind(cgh);
m_inputImpl.bind(cgh);
m_buf.bind(cgh);
m_kernel.bind(cgh);
} }
private: private:
// No assignment (copies are needed by the kernels) // No assignment (copies are needed by the kernels)
TensorEvaluator& operator = (const TensorEvaluator&); TensorEvaluator &operator=(const TensorEvaluator &);
TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl; TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
KernelArgType m_kernelArg; KernelArgType m_kernelArg;
TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl; TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
Indices m_indices; Indices m_indices;
Dimensions m_dimensions; Dimensions m_dimensions;
Scalar* m_buf; EvaluatorPointerType m_buf;
const Scalar* m_kernel; typename KernelStorage::Type m_kernel;
bool m_local_kernel; bool m_local_kernel;
const Eigen::SyclDevice& m_device; const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
}; }; // namespace Eigen
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
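The local-memory sizes computed in the 2D/3D convolution cases above follow the usual tiled-convolution rule: producing a tile of T outputs along a dimension with a kernel of size K requires staging T + K - 1 inputs per work-group. A minimal host-only sketch of that arithmetic (the 16x16 tile, 5x5 kernel and 48 KB limit are illustrative assumptions, not values queried from a device):

#include <cstddef>
#include <cstdio>

// Local-memory footprint (in elements) of one work-group for a 2D convolution
// tile, mirroring (local_x + k_x - 1) * (local_y + k_y - 1) * local_z above.
static std::size_t conv2d_tile_elems(std::size_t local_x, std::size_t local_y,
                                     std::size_t local_z, std::size_t k_x,
                                     std::size_t k_y) {
  return (local_x + k_x - 1) * (local_y + k_y - 1) * local_z;
}

int main() {
  const std::size_t elems = conv2d_tile_elems(16, 16, 1, 5, 5);  // 400 elements
  const std::size_t bytes = elems * sizeof(float);               // 1600 bytes
  const std::size_t shared_mem_per_block = 48 * 1024;            // assumed limit
  std::printf("tile needs %zu bytes, fits: %d\n", bytes,
              int(bytes <= shared_mem_per_block));
  return 0;
}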

@ -16,7 +16,6 @@
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set> #include <unordered_set>
namespace Eigen { namespace Eigen {
namespace TensorSycl { namespace TensorSycl {
@ -70,9 +69,9 @@ struct SyclDeviceInfo {
} // end namespace TensorSycl } // end namespace TensorSycl
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and // All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently // can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
// TensorFlow via the Eigen SYCL Backend. // TensorFlow via the Eigen SYCL Backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices() EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
-> decltype(cl::sycl::device::get_devices()) { -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
@ -421,6 +420,91 @@ class QueueInterface {
return pMapper.get_offset(ptr); return pMapper.get_offset(ptr);
} }
template <typename OutScalar, typename sycl_kernel, typename Lhs,
typename Rhs, typename OutPtr, typename Range, typename Index,
typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
const Rhs &rhs, OutPtr outptr,
Range thread_range,
Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a commandgroup handler
lhs.bind(cgh);
rhs.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
}
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename OutPtr, typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
OutPtr &outptr,
Range thread_range,
Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a commandgroup handler
inptr.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, inptr, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
}
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
Range thread_range,
Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a commandgroup handler
inptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, inptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
}
EIGEN_STRONG_INLINE void synchronize() const { EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS #ifdef EIGEN_EXCEPTIONS
m_queue.wait_and_throw(); m_queue.wait_and_throw();
@ -429,6 +513,7 @@ class QueueInterface {
#endif #endif
} }
EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
set_latest_event(e); set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION #ifndef EIGEN_SYCL_ASYNC_EXECUTION
@ -457,11 +542,10 @@ class QueueInterface {
/// This is used to prepare the number of threads and also the number of /// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels /// threads per block for sycl kernels
template <typename Index> template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, EIGEN_STRONG_INLINE void parallel_for_setup(
Index &tileSize0, const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
Index &tileSize1, Index &rng0, cl::sycl::range<2> &local_range) const {
Index &rng1, Index &GRange0, std::array<Index, 2> input_range = input_dim;
Index &GRange1) const {
Index max_workgroup_Size = Index max_workgroup_Size =
static_cast<Index>(getNearestPowerOfTwoWorkGroupSize()); static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
max_workgroup_Size = max_workgroup_Size =
@ -469,26 +553,28 @@ class QueueInterface {
EIGEN_SYCL_LOCAL_THREAD_DIM1), EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(max_workgroup_Size)); static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
tileSize1 = local_range[1] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2))); static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
rng1 = dim1; input_range[1] = input_dim[1];
if (rng1 == 0) rng1 = static_cast<Index>(1); if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
GRange1 = rng1; global_range[1] = input_range[1];
if (tileSize1 > GRange1) if (local_range[1] > global_range[1])
tileSize1 = GRange1; local_range[1] = global_range[1];
else if (GRange1 > tileSize1) { else if (global_range[1] > local_range[1]) {
Index xMode = static_cast<Index>(GRange1 % tileSize1); Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); if (xMode != 0)
global_range[1] += static_cast<Index>(local_range[1] - xMode);
} }
tileSize0 = static_cast<Index>(max_workgroup_Size / tileSize1); local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
rng0 = dim0; input_range[0] = input_dim[0];
if (rng0 == 0) rng0 = static_cast<Index>(1); if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
GRange0 = rng0; global_range[0] = input_range[0];
if (tileSize0 > GRange0) if (local_range[0] > global_range[0])
tileSize0 = GRange0; local_range[0] = global_range[0];
else if (GRange0 > tileSize0) { else if (global_range[0] > local_range[0]) {
Index xMode = static_cast<Index>(GRange0 % tileSize0); Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); if (xMode != 0)
global_range[0] += static_cast<Index>(local_range[0] - xMode);
} }
} }
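To make the 2D overload above concrete: the work-group budget is split into two power-of-two factors, and each global range is rounded up to a multiple of its local range. A host-only sketch of the same rounding (std::array stands in for cl::sycl::range, and the 256-thread work-group budget is an assumed device limit, not a queried one):

#include <array>
#include <cmath>
#include <cstdio>

// Same rounding scheme as parallel_for_setup(std::array<Index,2>, ...): split a
// power-of-two work-group budget across two dimensions and round each global
// range up to a multiple of the local range.
static void setup_2d(const std::array<long, 2>& input_dim, long max_wg,
                     std::array<long, 2>& global_range,
                     std::array<long, 2>& local_range) {
  const long pow_of_2 = static_cast<long>(std::log2(max_wg));
  local_range[1] = 1L << (pow_of_2 / 2);
  for (int d = 1; d >= 0; --d) {
    const long rng = input_dim[d] == 0 ? 1 : input_dim[d];
    global_range[d] = rng;
    if (local_range[d] > global_range[d]) {
      local_range[d] = global_range[d];
    } else if (global_range[d] % local_range[d] != 0) {
      global_range[d] += local_range[d] - global_range[d] % local_range[d];
    }
    if (d == 1) local_range[0] = max_wg / local_range[1];  // remaining budget for dim 0
  }
}

int main() {
  std::array<long, 2> global{}, local{};
  setup_2d({100, 50}, /*max_wg=*/256, global, local);
  // local = {16, 16}; global = {112, 64}: each a multiple of the local range.
  std::printf("local {%ld,%ld} global {%ld,%ld}\n", local[0], local[1],
              global[0], global[1]);
  return 0;
}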
@ -496,9 +582,9 @@ class QueueInterface {
/// threads per block for sycl kernels /// threads per block for sycl kernels
template <typename Index> template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup( EIGEN_STRONG_INLINE void parallel_for_setup(
Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, cl::sycl::range<3> &local_range) const {
Index &GRange1, Index &GRange2) const { std::array<Index, 3> input_range = input_dim;
Index max_workgroup_Size = Index max_workgroup_Size =
static_cast<Index>(getNearestPowerOfTwoWorkGroupSize()); static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
max_workgroup_Size = max_workgroup_Size =
@ -506,45 +592,48 @@ class QueueInterface {
EIGEN_SYCL_LOCAL_THREAD_DIM1), EIGEN_SYCL_LOCAL_THREAD_DIM1),
static_cast<Index>(max_workgroup_Size)); static_cast<Index>(max_workgroup_Size));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
tileSize2 = local_range[2] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3))); static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
rng2 = dim2; input_range[2] = input_dim[2];
if (rng2 == 0) rng2 = static_cast<Index>(1); if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
GRange2 = rng2; global_range[2] = input_range[2];
if (tileSize2 > GRange2) if (local_range[2] > global_range[2])
tileSize2 = GRange2; local_range[2] = global_range[2];
else if (GRange2 > tileSize2) { else if (global_range[2] > local_range[2]) {
Index xMode = static_cast<Index>(GRange2 % tileSize2); Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode); if (xMode != 0)
global_range[2] += static_cast<Index>(local_range[2] - xMode);
} }
pow_of_2 = static_cast<Index>( pow_of_2 = static_cast<Index>(
std::log2(static_cast<Index>(max_workgroup_Size / tileSize2))); std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
tileSize1 = local_range[1] =
static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2))); static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
rng1 = dim1; input_range[1] = input_dim[1];
if (rng1 == 0) rng1 = static_cast<Index>(1); if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
GRange1 = rng1; global_range[1] = input_range[1];
if (tileSize1 > GRange1) if (local_range[1] > global_range[1])
tileSize1 = GRange1; local_range[1] = global_range[1];
else if (GRange1 > tileSize1) { else if (global_range[1] > local_range[1]) {
Index xMode = static_cast<Index>(GRange1 % tileSize1); Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); if (xMode != 0)
global_range[1] += static_cast<Index>(local_range[1] - xMode);
} }
tileSize0 = local_range[0] = static_cast<Index>(max_workgroup_Size /
static_cast<Index>(max_workgroup_Size / (tileSize1 * tileSize2)); (local_range[1] * local_range[2]));
rng0 = dim0; input_range[0] = input_dim[0];
if (rng0 == 0) rng0 = static_cast<Index>(1); if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
GRange0 = rng0; global_range[0] = input_range[0];
if (tileSize0 > GRange0) if (local_range[0] > global_range[0])
tileSize0 = GRange0; local_range[0] = global_range[0];
else if (GRange0 > tileSize0) { else if (global_range[0] > local_range[0]) {
Index xMode = static_cast<Index>(GRange0 % tileSize0); Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); if (xMode != 0)
global_range[0] += static_cast<Index>(local_range[0] - xMode);
} }
} }
EIGEN_STRONG_INLINE bool has_local_memory() const { EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCA_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) #if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
return false; return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
return true; return true;
@ -768,25 +857,19 @@ struct SyclDevice : public SyclDeviceBase {
/// This is used to prepare the number of threads and also the number of /// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels /// threads per block for sycl kernels
template <typename Index> template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, EIGEN_STRONG_INLINE void parallel_for_setup(
Index &tileSize0, const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
Index &tileSize1, Index &rng0, cl::sycl::range<2> &local_range) const {
Index &rng1, Index &GRange0, queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
Index &GRange1) const {
queue_stream()->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0,
rng1, GRange0, GRange1);
} }
/// This is used to prepare the number of threads and also the number of /// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels /// threads per block for sycl kernels
template <typename Index> template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup( EIGEN_STRONG_INLINE void parallel_for_setup(
Index dim0, Index dim1, Index dim2, Index &tileSize0, Index &tileSize1, const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, cl::sycl::range<3> &local_range) const {
Index &GRange1, Index &GRange2) const { queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
queue_stream()->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1,
tileSize2, rng0, rng1, rng2, GRange0,
GRange1, GRange2);
} }
/// allocate device memory /// allocate device memory
@ -943,6 +1026,22 @@ struct SyclDevice : public SyclDeviceBase {
EIGEN_STRONG_INLINE std::string getDeviceVendor() const { EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
return queue_stream()->getDeviceVendor(); return queue_stream()->getDeviceVendor();
} }
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
var...);
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
var...);
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
var...);
}
}; };
} // end namespace Eigen } // end namespace Eigen
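The unary/binary/nullary kernel launchers added above share one contract: the evaluator-like arguments must expose bind(cl::sycl::handler&), the launcher allocates the local scratch accessor itself, and the kernel functor is constructed as Functor(scratch, inputs..., output, extra args...) and invoked with a cl::sycl::nd_item<1>. A sketch of a functor with that shape is given below; SquareKernel and its accessor template parameters are hypothetical, only the constructor/operator() shape is taken from the launcher code above.

#include <CL/sycl.hpp>

// Hypothetical kernel functor with the shape expected by unary_kernel_launcher:
// constructed as Kernel(scratch, in, out, extra...) and called with nd_item<1>.
template <typename Scalar, typename InAcc, typename OutAcc>
struct SquareKernel {
  typedef cl::sycl::accessor<Scalar, 1, cl::sycl::access::mode::read_write,
                             cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;  // sized by the scratchSize passed to the launcher
  InAcc in;
  OutAcc out;
  const size_t n;  // example of a trailing "T... var" argument
  SquareKernel(LocalAccessor scratch_, InAcc in_, OutAcc out_, size_t n_)
      : scratch(scratch_), in(in_), out(out_), n(n_) {}
  void operator()(cl::sycl::nd_item<1> item) {
    const size_t gid = item.get_global_linear_id();
    if (gid < n) out[gid] = in[gid] * in[gid];
  }
};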

@ -649,131 +649,75 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
// SYCL Executor policy // SYCL Executor policy
#ifdef EIGEN_USE_SYCL #ifdef EIGEN_USE_SYCL
template <bool Vectorizable, typename Evaluator> template <typename Evaluator>
struct ExecExprFunctorKernel_impl { struct ExecExprFunctorKernel {
typedef typename Evaluator::Index Index; typedef typename Evaluator::Index Index;
const Index range;
const Index vectorizable_threads;
Evaluator evaluator; Evaluator evaluator;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl( const Index range;
const Index range_, const Index vectorizable_threads_, template <typename Scratch>
Evaluator evaluator_) EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
: range(range_), vectorizable_threads(vectorizable_threads_), const Scratch, Evaluator evaluator_, const Index range_)
evaluator(evaluator_) {} : evaluator(evaluator_), range(range_) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
operator()(cl::sycl::nd_item<1> itemID) { cl::sycl::nd_item<1> itemID) {
compute(itemID);
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
compute(const cl::sycl::nd_item<1>& itemID) {
Index gId = static_cast<Index>(itemID.get_global_linear_id()); Index gId = static_cast<Index>(itemID.get_global_linear_id());
Index total_threads = itemID.get_global_range(0); Index total_threads = itemID.get_global_range(0);
EIGEN_UNROLL_LOOP
for (Index i = gId; i < range; i += total_threads) { for (Index i = gId; i < range; i += total_threads) {
evaluator.evalScalar(i); evaluator.evalScalar(i);
} }
} }
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
compute(const cl::sycl::nd_item<1>& itemID) {
const Index vectorizedRange =
(range / Evaluator::PacketSize) * Evaluator::PacketSize;
Index gId = static_cast<Index>(itemID.get_global_linear_id());
const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
const Index start = Evaluator::PacketSize * gId;
for (Index i = start; i < vectorizedRange; i += step) {
evaluator.evalPacket(i);
}
gId += vectorizedRange;
for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
evaluator.evalScalar(i);
}
}
}; };
template <typename Evaluator>
struct ExecExprFunctorKernel_impl<true, Evaluator> {
typedef typename Evaluator::Index Index;
const Index range;
const Index vectorizable_threads;
Evaluator evaluator;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
const Index range_, const Index vectorizable_threads_,
Evaluator evaluator_)
: range(range_), vectorizable_threads(vectorizable_threads_),
evaluator(evaluator_) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
operator()(cl::sycl::nd_item<1> itemID) {
Index gId = static_cast<Index>(itemID.get_global_linear_id());
if (gId < vectorizable_threads) {
const Index PacketSize = Eigen::internal::unpacket_traits<
typename Evaluator::PacketReturnType>::size;
evaluator.evalPacket(gId * PacketSize);
gId += (vectorizable_threads * PacketSize);
EIGEN_UNROLL_LOOP
for (Index i = gId; i < range; i += vectorizable_threads) {
evaluator.evalScalar(i);
}
}
}
};
template <typename Expr, bool NonZeroVectoriseSize, typename Evaluator>
struct ExecExprFunctorKernel
: ExecExprFunctorKernel_impl<
::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
Evaluator> {
ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
const Evaluator &evaluator)
: ExecExprFunctorKernel_impl<
::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
Evaluator>(range_, vectorizable_threads_, evaluator) {}
};
template <typename Expr, typename Evaluator>
struct ExecExprFunctorKernel<Expr, false, Evaluator>
: ExecExprFunctorKernel_impl<false, Evaluator> {
ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
const Evaluator &evaluator)
: ExecExprFunctorKernel_impl<false, Evaluator>(
range_, vectorizable_threads_, evaluator) {}
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling> template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> { class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
public: public:
typedef typename Expression::Index Index; typedef typename Expression::Index Index;
static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { static EIGEN_STRONG_INLINE void run(const Expression& expr,
Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> evaluator(expr, dev); const Eigen::SyclDevice& dev) {
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
Evaluator evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) { if (needs_assign) {
Index range, GRange, tileSize; Index range, GRange, tileSize;
Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
total_size = (total_size == 0) ? 1 : total_size; total_size = (total_size == 0) ? 1 : total_size;
const int PacketSize = Eigen::PacketType< const int PacketSize =
typename Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>::CoeffReturnType, Eigen::PacketType<typename Evaluator::CoeffReturnType,
Eigen::SyclDevice>::size; Eigen::SyclDevice>::size;
Index vectorizable_threads = Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
static_cast<Index>(total_size / PacketSize);
dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
range = total_size; range = total_size;
auto f = [&](cl::sycl::handler &cgh) {
evaluator.bind(cgh);
typedef ExecExprFunctorKernel<Expression, true,
Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
conditional_vectorized_kernel;
typedef ExecExprFunctorKernel<Expression, false, dev.template nullary_kernel_launcher<
Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>> typename Evaluator::CoeffReturnType,
non_vectorized_kernel; ExecExprFunctorKernel<Evaluator> >(
// This is to make sure that an expression with a size less than vectorized size evaluator,
// will not call the vectorized kernel. cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
// The reason for having this kernel is that the vectorisable parameter is a cl::sycl::range<1>(tileSize)),
// compile-time parameter, Index(1), range);
// however, the size of a tensor is a run-time parameter
(vectorizable_threads)
? cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
dev.program().template get_kernel<vectorized_kernel>(),
#endif
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
cl::sycl::range<1>(tileSize)),
conditional_vectorized_kernel(range, vectorizable_threads,
evaluator))
: cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
dev.program().template get_kernel<non_vectorized_kernel>(),
#endif
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
cl::sycl::range<1>(tileSize)),
non_vectorized_kernel(range, vectorizable_threads,
evaluator));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f));
dev.async_synchronize(e);
} }
evaluator.cleanup(); evaluator.cleanup();
} }

@ -123,7 +123,7 @@ struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
namespace TensorSycl { namespace TensorSycl {
namespace internal{ namespace internal{
template <typename Evaluator, typename Op> class ReductionFunctor; template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
} }
} }
#endif #endif

@ -421,7 +421,7 @@ template <typename Index, typename Device, bool BlockAccess> struct MemcpyTrigge
#ifdef EIGEN_USE_GPU #ifdef EIGEN_USE_GPU
template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> { template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> {
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; } EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
}; };
#endif #endif
@ -430,7 +430,7 @@ template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index
#ifdef EIGEN_USE_SYCL #ifdef EIGEN_USE_SYCL
template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess> { template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess> {
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { return contiguous > 4*1024*1024; } EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
}; };
#endif #endif

@ -946,7 +946,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
#endif #endif
#if defined(EIGEN_USE_SYCL) #if defined(EIGEN_USE_SYCL)
template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::ReductionFunctor; template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
// SYCL needs the Generic reducer for the case where the reduction algorithm is neither inner, outer, nor full reducer // SYCL needs the Generic reducer for the case where the reduction algorithm is neither inner, outer, nor full reducer
template <typename, typename, typename> friend struct internal::GenericReducer; template <typename, typename, typename> friend struct internal::GenericReducer;
#endif #endif

@ -11,167 +11,576 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/***************************************************************** /*****************************************************************
* TensorSyclPlaceHolderExpr.h * TensorReductionSycl.h
* *
* \brief: * \brief:
* This is the specialisation of the placeholder expression based on the * This is the specialization of the reduction operation. A two-phase reduction
* operation type * approach is used since the GPU has no global synchronization for global memory
* across different work-groups/thread blocks. To solve this, two kernels are needed:
* the first kernel reduces the data locally, and each work-group/thread block writes
* its partial result to global memory. In the second phase (global reduction), a
* single work-group/thread block reduces the intermediate data down to one element.
* Here is an NVIDIA presentation explaining the optimized two-phase reduction algorithm on a GPU:
* https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
* *
*****************************************************************/ *****************************************************************/
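The control flow described in the comment above can be summarised with a host-only model (plain C++, no SYCL): phase one gives every work-group a strided slice of the input and writes one partial value per group to a temporary buffer, and phase two reduces those partials with a single group. The group count below is illustrative only.

#include <cstddef>
#include <cstdio>
#include <vector>

// Host-only model of the two-phase reduction: each "work-group" reduces a
// strided slice of the input to one partial value (phase 1), then a single
// group reduces the partials (phase 2).
static float two_phase_sum(const std::vector<float>& in, int num_groups) {
  std::vector<float> partials(num_groups, 0.f);
  for (int g = 0; g < num_groups; ++g)  // phase 1: one partial per group
    for (std::size_t i = g; i < in.size(); i += num_groups)
      partials[g] += in[i];
  float total = 0.f;  // phase 2: one group reduces the partials
  for (float p : partials) total += p;
  return total;
}

int main() {
  std::vector<float> data(1000, 1.f);
  std::printf("sum = %f\n", two_phase_sum(data, /*num_groups=*/8));  // 1000
  return 0;
}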
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
namespace Eigen { namespace Eigen {
namespace TensorSycl {
namespace internal { namespace internal {
template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{ template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
template<typename BufferTOut, typename BufferTIn> struct OpDefiner {
static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
do { typedef Op type;
auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
* and a second to local memory, used to store intermediate data. */
auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h);
typedef decltype(aI) InputAccessor;
typedef decltype(aOut) OutputAccessor;
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(local), h);
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local));
};
dev.sycl_queue().submit(f);
dev.asynchronousExec();
/* At this point, you could queue::wait_and_throw() to ensure that
* errors are caught quickly. However, this would likely impact
* performance negatively. */
length = length / local;
} while (length > 1);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
const Index &) {
return accumulator;
}
}; };
template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{ template <typename CoeffReturnType, typename Index>
template<typename BufferTOut, typename BufferTIn> struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ typedef Eigen::internal::SumReducer<CoeffReturnType> type;
syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(), static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
bufOut, out_offset, bufI, dev, length, local); return type();
}
};
/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
// a leafNode.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
typedef typename Self::CoeffReturnType CoeffReturnType;
static const bool HasOptimizedImplementation = false;
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
size_t inputSize =self.impl().dimensions().TotalSize();
size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
size_t remaining = inputSize% red_factor;
if(rng ==0) {
red_factor=1;
};
size_t tileSize =dev.sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
size_t GRange=std::max((size_t )1, rng);
// convert global range to power of 2 for redecution
GRange--;
GRange |= GRange >> 1;
GRange |= GRange >> 2;
GRange |= GRange >> 4;
GRange |= GRange >> 8;
GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
GRange |= GRange >> 32;
#endif
GRange++;
size_t outTileSize = tileSize;
/// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
if (GRange < outTileSize) outTileSize=GRange;
/// creating the shared memory for calculating reduction.
/// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can
/// recursively apply reduction on it in order to reduce the whole.
auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
// Dims dims= self.xprDims();
//Op functor = reducer;
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// this is a workaround for gcc 4.8 bug
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
// create a tuple of accessors from Evaluator
TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
typedef decltype(tmp_global_accessor) OutAccessor;
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType>
(tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors));
});
dev.asynchronousExec();
// getting final out buffer at the moment the created buffer is true because there is no need for assign
auto out_buffer =dev.get_sycl_buffer(output);
ptrdiff_t out_offset = dev.get_offset(output);
/// This is used to recursively reduce the tmp value to an element of 1;
syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize);
} }
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
const Index &scale) {
::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
return quotient_op(accumulator, CoeffReturnType(scale));
}
}; };
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
typedef Eigen::internal::SumReducer<CoeffReturnType> type;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
return type();
}
template <typename Self, typename Op> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
struct InnerReducer<Self, Op, const Eigen::SyclDevice> { const Index &scale) {
return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
}
};
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
Index local_range>
struct SecondStepFullReducer {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
typedef typename OpDef::type Op;
LocalAccessor scratch;
InputAccessor aI;
OutputAccessor outAcc;
Op op;
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
: scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
void operator()(cl::sycl::nd_item<1> itemID) {
// Our empirical research shows that the best performance is achieved
// when there is only one element per thread to reduce in the second step;
// in that case the second-step reduction time is almost negligible.
// Hence, in the second step of the reduction the input size is fixed to the
// local size, so there is only one element read per thread. The
// algorithm must be changed if the number of reductions per thread in the
// second step is greater than 1. Otherwise, the result will be wrong.
const Index localid = itemID.get_local_id(0);
auto aInPtr = aI.get_pointer() + localid;
auto aOutPtr = outAcc.get_pointer();
CoeffReturnType *scratchptr = scratch.get_pointer();
CoeffReturnType accumulator = *aInPtr;
scratchptr[localid] = op.finalize(accumulator);
#pragma unroll 8
for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (localid < offset) {
op.reduce(scratchptr[localid + offset], &accumulator);
scratchptr[localid] = op.finalize(accumulator);
}
}
if (localid == 0) *aOutPtr = op.finalize(accumulator);
}
};
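Both steps above share the same in-local-memory tree pattern: each thread stores its accumulator in scratch, then the active range is halved each iteration, with a barrier between iterations so that reads observe the previous round's writes. A serial model of the halving loop (the barrier is implicit here because the loops run sequentially, and the values are arbitrary):

#include <cstdio>

int main() {
  // Serial model of the offset-halving loop used above; local_range must be a
  // power of two, and op.reduce is specialised to a sum for illustration.
  const int local_range = 8;
  float scratch[local_range] = {3, 1, 4, 1, 5, 9, 2, 6};
  for (int offset = local_range / 2; offset > 0; offset /= 2)
    for (int localid = 0; localid < offset; ++localid)  // "threads" below offset stay active
      scratch[localid] += scratch[localid + offset];
  std::printf("reduced value = %f\n", scratch[0]);  // 31
  return 0;
}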
// Full reduction, first phase. In this version vectorization is enabled and the reduction accepts
// any generic reducer Op, e.g. (max, min, sum, mean, iamax, iamin, etc.).
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
public:
typedef typename Evaluator::CoeffReturnType CoeffReturnType;
typedef typename Evaluator::Index Index;
typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
OpDef;
typedef typename OpDef::type Op;
typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
typedef typename Evaluator::PacketReturnType PacketReturnType;
typedef
typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
PacketReturnType, CoeffReturnType>::type OutType;
typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch;
Evaluator evaluator;
EvaluatorPointerType final_output;
Index rng;
Op op;
FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
Index rng_, OpType op_)
: scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
const cl::sycl::nd_item<1> &itemID) {
auto output_ptr = final_output.get_pointer();
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
Index step = Evaluator::PacketSize * itemID.get_global_range(0);
Index start = Evaluator::PacketSize * globalid;
// vectorizable parts
PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
#pragma unroll(8 / Evaluator::PacketSize)
for (Index i = start; i < VectorizedRange; i += step) {
op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
}
globalid += VectorizedRange;
// non vectorizable parts
for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
op.template reducePacket<PacketReturnType>(
::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
evaluator.impl().coeff(i), op.initialize()),
&packetAccumulator);
}
scratch[localid] = packetAccumulator =
OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
// reduction parts // Local size is always power of 2
EIGEN_UNROLL_LOOP
for (Index offset = local_range / 2; offset > 0; offset /= 2) {
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (localid < offset) {
op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
}
}
if (localid == 0) {
output_ptr[itemID.get_group(0)] =
op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
}
}
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
const cl::sycl::nd_item<1> &itemID) {
auto output_ptr = final_output.get_pointer();
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
// vectorizable parts
CoeffReturnType accumulator = op.initialize();
// non vectorizable parts
for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
op.reduce(evaluator.impl().coeff(i), &accumulator);
}
scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
// reduction parts. the local size is always power of 2
EIGEN_UNROLL_LOOP
for (Index offset = local_range / 2; offset > 0; offset /= 2) {
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (localid < offset) {
op.reduce(scratch[localid + offset], &accumulator);
scratch[localid] = op.finalize(accumulator);
}
}
if (localid == 0) {
output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
}
}
};
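The vectorized path of compute_reduction above splits the work into a packet loop over the largest multiple of PacketSize and a scalar tail, exactly as the "vectorizable parts / non vectorizable parts" comments say. A scalar model of that split (the packet width of 4 and the data are illustrative):

#include <cstdio>

int main() {
  // Model of the packet-loop / scalar-tail split used in compute_reduction.
  const int PacketSize = 4;  // hypothetical packet width
  const int n = 10;
  float data[n] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  const int vectorized_range = (n / PacketSize) * PacketSize;  // 8
  float acc = 0.f;
  for (int i = 0; i < vectorized_range; i += PacketSize)  // packet loop
    for (int lane = 0; lane < PacketSize; ++lane) acc += data[i + lane];
  for (int i = vectorized_range; i < n; ++i) acc += data[i];  // scalar tail
  std::printf("sum = %f\n", acc);  // 55
  return 0;
}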
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
public:
typedef typename Evaluator::CoeffReturnType CoeffReturnType;
typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
typedef typename Evaluator::Index Index;
typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
typedef typename OpDef::type Op;
template <typename Scratch>
GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
Index range_, Index num_values_to_reduce_)
: evaluator(evaluator_),
output_accessor(output_accessor_),
functor(OpDef::get_op(functor_)),
range(range_),
num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
auto output_accessor_ptr = output_accessor.get_pointer();
/// const cast added as a naive solution to solve the qualifier drop error
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
evaluator, evaluator.firstInput(globalid), functor, &accum);
output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
}
}
private:
Evaluator evaluator;
EvaluatorPointerType output_accessor;
Op functor;
Index range;
Index num_values_to_reduce;
};
enum class reduction_dim { inner_most, outer_most };
// default is preserver
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
typedef typename Evaluator::CoeffReturnType CoeffReturnType;
typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
typedef typename Evaluator::Index Index;
typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
typedef typename OpDef::type Op;
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
ScratchAcc;
ScratchAcc scratch;
Evaluator evaluator;
EvaluatorPointerType output_accessor;
Op op;
const Index preserve_elements_num_groups;
const Index reduce_elements_num_groups;
const Index num_coeffs_to_preserve;
const Index num_coeffs_to_reduce;
PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
: scratch(scratch_),
evaluator(evaluator_),
output_accessor(output_accessor_),
op(OpDef::get_op(op_)),
preserve_elements_num_groups(preserve_elements_num_groups_),
reduce_elements_num_groups(reduce_elements_num_groups_),
num_coeffs_to_preserve(num_coeffs_to_preserve_),
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
CoeffReturnType &accumulator) {
if (globalPId >= num_coeffs_to_preserve) {
return;
}
Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
: globalRId + (globalPId * num_coeffs_to_reduce);
Index localOffset = globalRId;
const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
const Index per_thread_global_stride =
rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
#pragma unroll 8
for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
localOffset += per_thread_local_stride;
global_offset += per_thread_global_stride;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
const Index linearLocalThreadId = itemID.get_local_id(0);
Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
: linearLocalThreadId / PannelParameters::LocalThreadSizeR;
Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
: linearLocalThreadId % PannelParameters::LocalThreadSizeR;
const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
: itemID.get_group(0) / reduce_elements_num_groups;
const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
: itemID.get_group(0) % reduce_elements_num_groups;
Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
auto scratchPtr = scratch.get_pointer().get();
auto outPtr =
output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
CoeffReturnType accumulator = op.initialize();
element_wise_reduce(globalRId, globalPId, accumulator);
accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
accumulator;
if (rt == reduction_dim::inner_most) {
pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
}
/* Apply the reduction operation between the current local
* id and the one on the other half of the vector. */
auto out_scratch_ptr =
scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (rt == reduction_dim::inner_most) {
accumulator = *out_scratch_ptr;
}
// The Local LocalThreadSizeR is always power of 2
EIGEN_UNROLL_LOOP
for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
if (rLocalThreadId < offset) {
op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
// The result has already been divided for the mean reducer in the
// previous reduction, so there is no need to divide it further
*out_scratch_ptr = op.finalize(accumulator);
}
/* All threads collectively read from global memory into local.
* The barrier ensures all threads' IO is resolved before
* execution continues (strictly speaking, all threads within
* a single work-group - there is no co-ordination between
* work-groups, only work-items). */
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
outPtr[globalPId] = op.finalize(accumulator);
}
}
};
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
typedef typename OpDef::type Op;
typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
ScratchAccessor;
InputAccessor input_accessor;
OutputAccessor output_accessor;
Op op;
const Index num_coeffs_to_preserve;
const Index num_coeffs_to_reduce;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
OutputAccessor output_accessor_, OpType op_,
const Index num_coeffs_to_preserve_,
const Index num_coeffs_to_reduce_)
: input_accessor(input_accessor_),
output_accessor(output_accessor_),
op(OpDef::get_op(op_)),
num_coeffs_to_preserve(num_coeffs_to_preserve_),
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
const Index globalId = itemID.get_global_id(0);
if (globalId >= num_coeffs_to_preserve) return;
auto in_ptr = input_accessor.get_pointer() + globalId;
OutScalar accumulator = op.initialize();
// num_coeffs_to_reduce is not bigger than 256
#pragma unroll 8
for (Index i = 0; i < num_coeffs_to_reduce; i++) {
op.reduce(*in_ptr, &accumulator);
in_ptr += num_coeffs_to_preserve;
}
output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
}
}; // namespace internal
template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
static EIGEN_CONSTEXPR bool BC = BC_;
};
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;
  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
    // getPowerOfTwo makes sure the local range is a power of 2 and <=
    // maxSyclThreadPerBlock; this lets us avoid an extra check in the
    // kernel.
static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
(PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
"The Local thread size must be a power of 2 for the reduction "
"operation");
EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // In this step we force the code to be at most a 2-step reduction:
    // our empirical research shows that if each thread reduces at least 64
    // elements individually, we get better performance. However, this can
    // change on different platforms. It also shows that for the inner_most
    // dim reducer, it is better to have 8 groups in the reduce dimension for
    // sizes > 1024 to achieve the best performance.
const Index reductionPerThread = 64;
Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
const Index globalRange = pNumGroups * rNumGroups * localRange;
EIGEN_CONSTEXPR Index scratchSize =
PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
if (rNumGroups > 1) {
CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
num_coeffs_to_reduce);
typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
SecondStepPartialReductionKernel;
dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
temp_accessor, output,
cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
reducer, num_coeffs_to_preserve, rNumGroups);
self.device().deallocate_temp(temp_pointer);
} else {
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
num_coeffs_to_reduce);
}
return false;
}
};
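// Illustrative sketch: the launch-geometry arithmetic used by
// PartialReducerLauncher::run above, extracted into a standalone helper with
// hypothetical names. Assuming a 16 x 16 work-group, 64 reduced elements per
// thread and 16 compute units, a 4096 x 4096 partial reduction gives
// p_num_groups = 256 and r_num_groups = 1 (since 4096 <= 64 * 256), i.e. a
// single-step reduction with no temporary buffer.
inline void partial_reduction_geometry_sketch(int num_coeffs_to_preserve, int num_coeffs_to_reduce,
                                              int local_p, int local_r, int compute_units,
                                              int &p_num_groups, int &r_num_groups, int &global_range) {
  const int reduction_per_thread = 64;        // empirically chosen, as in run()
  const int local_range = local_p * local_r;  // must be a power of two
  const int round_up_p = ((num_coeffs_to_preserve + local_p - 1) / local_p) * local_p;
  p_num_groups = round_up_p / local_p;
  const int r_groups = (compute_units + p_num_groups - 1) / p_num_groups;
  r_num_groups = num_coeffs_to_reduce > reduction_per_thread * local_range
                     ? (r_groups < local_range ? r_groups : local_range)
                     : 1;
  global_range = p_num_groups * r_num_groups * local_range;
}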
} // namespace internal
} // namespace TensorSycl
namespace internal {
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
"The Local thread size must be a power of 2 for the reduction "
"operation");
EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // In this step we force the code to be at most a 2-step reduction:
    // our empirical research shows that if each thread reduces at least 512
    // elements individually, we get better performance.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;
auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
if (num_work_group > 1) {
CoeffReturnType *temp_pointer =
static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
local_range, inputSize, reducer);
typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
EvaluatorPointerType, Index, local_range>
GenericRKernel;
dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
tmp_global_accessor, data,
cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
reducer);
dev.deallocate_temp(temp_pointer);
} else {
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
reducer);
}
}
};
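// Minimal usage sketch, kept as a comment and modeled on the Eigen SYCL tests:
// a full reduction that would be dispatched through FullReducer::run above.
// The selector, device setup and buffer handling follow the test convention
// and may need adjusting for a given ComputeCpp/DPC++ installation.
//
//   Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::Tensor<float, 2> in(512, 512);
//   in.setRandom();
//   float* gpu_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
//   float* gpu_out = static_cast<float*>(sycl_device.allocate(sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(gpu_in, 512, 512);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out_gpu(gpu_out);
//   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
//   out_gpu.device(sycl_device) = in_gpu.sum();   // handled by FullReducer::run
//   float result;
//   sycl_device.memcpyDeviceToHost(&result, gpu_out, sizeof(float));
//   sycl_device.deallocate(gpu_in);
//   sycl_device.deallocate(gpu_out);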
// vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
typename Self::Index num_coeffs_to_preserve) {
return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
num_coeffs_to_reduce,
num_coeffs_to_preserve);
}
};
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
typename Self::Index num_coeffs_to_preserve) {
return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
num_coeffs_to_reduce,
num_coeffs_to_preserve);
}
};
// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
    return false;
  }
};
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP


@ -0,0 +1,512 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorScanSycl.h
*
* \brief:
 * Tensor Scan Sycl implements an extended version of
 * "Efficient parallel scan algorithms for GPUs" for Tensor operations.
 * The algorithm requires up to 3 stages (and consequently 3 kernels),
 * depending on the size of the tensor. In the first kernel
 * (ScanKernelFunctor), each thread within the work-group individually
 * reduces the elements allocated to it in order to reduce the total number
 * of blocks. In the next step all threads within the work-group reduce the
 * associated blocks into the temporary buffer. In the next kernel
 * (ScanBlockKernelFunctor), the temporary buffer is given as an input and
 * all the threads within a work-group scan and reduce the boundaries between
 * the blocks (generated by the previous kernel) and write the data to the
 * temporary buffer. If the second kernel is required, the third and final
 * kernel (ScanAdjustmentKernelFunctor) adjusts the final result into the
 * output buffer.
* The original algorithm for the parallel prefix sum can be found here:
*
* Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
* scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
*1, no. 1 (2008): 1-17.
*****************************************************************/
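// Minimal usage sketch, kept as a comment: the three-stage scan described
// above is what ultimately services a cumulative sum on the SYCL device.
// Setup mirrors the Eigen SYCL tests and is hypothetical in its details.
//
//   Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::Tensor<float, 3> in(4, 1024, 4);
//   in.setRandom();
//   float* gpu_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
//   float* gpu_out = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 3>> in_gpu(gpu_in, 4, 1024, 4);
//   Eigen::TensorMap<Eigen::Tensor<float, 3>> out_gpu(gpu_out, 4, 1024, 4);
//   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
//   out_gpu.device(sycl_device) = in_gpu.cumsum(1);  // scan along dimension 1
//   // ... copy the result back with memcpyDeviceToHost and deallocate both buffers.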
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif
template <typename index_t>
struct ScanParameters {
// must be power of 2
static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
const index_t total_size;
const index_t non_scan_size;
const index_t scan_size;
const index_t non_scan_stride;
const index_t scan_stride;
const index_t panel_threads;
const index_t group_threads;
const index_t block_threads;
const index_t elements_per_group;
const index_t elements_per_block;
const index_t loop_range;
ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
: total_size(total_size_),
non_scan_size(non_scan_size_),
scan_size(scan_size_),
non_scan_stride(non_scan_stride_),
scan_stride(scan_stride_),
panel_threads(panel_threads_),
group_threads(group_threads_),
block_threads(block_threads_),
elements_per_group(elements_per_group_),
elements_per_block(elements_per_block_),
loop_range(loop_range_) {}
};
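// Illustrative sketch: how the kernels below decompose a flat global thread
// index using these parameters. The struct, helper name and use of plain ints
// are hypothetical; the real kernels perform this arithmetic inline inside
// operator().
struct ThreadCoordinatesSketch {
  int panel_id, group_id, block_id, local_id;
};
inline ThreadCoordinatesSketch decompose_thread_id_sketch(int data_offset, int panel_threads,
                                                          int group_threads, int block_threads) {
  ThreadCoordinatesSketch c;
  int tmp = data_offset % panel_threads;
  c.panel_id = data_offset / panel_threads;  // which panel this thread works on
  c.group_id = tmp / group_threads;          // which non-scan slice within the panel
  tmp = tmp % group_threads;
  c.block_id = tmp / block_threads;          // which block of the scanned dimension
  c.local_id = tmp % block_threads;          // position within the block
  return c;
}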
enum class scan_step { first, second };
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
scan_step stp>
struct ScanKernelFunctor {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
LocalAccessor scratch;
Evaluator dev_eval;
OutAccessor out_accessor;
OutAccessor temp_accessor;
const ScanParameters<Index> scanParameters;
Op accumulator;
const bool inclusive;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
OutAccessor out_accessor_, OutAccessor temp_accessor_,
const ScanParameters<Index> scanParameters_, Op accumulator_,
const bool inclusive_)
: scratch(scratch_),
dev_eval(dev_eval_),
out_accessor(out_accessor_),
temp_accessor(temp_accessor_),
scanParameters(scanParameters_),
accumulator(accumulator_),
inclusive(inclusive_) {}
template <scan_step sst = stp, typename Input>
typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
read(const Input &inpt, Index global_id) {
return inpt.coeff(global_id);
}
template <scan_step sst = stp, typename Input>
typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
read(const Input &inpt, Index global_id) {
return inpt[global_id];
}
template <scan_step sst = stp, typename InclusiveOp>
typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
first_step_inclusive_Operation(InclusiveOp inclusive_op) {
inclusive_op();
}
template <scan_step sst = stp, typename InclusiveOp>
typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
first_step_inclusive_Operation(InclusiveOp) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
auto out_ptr = out_accessor.get_pointer();
auto tmp_ptr = temp_accessor.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
Index tmp = data_offset % scanParameters.panel_threads;
const Index panel_id = data_offset / scanParameters.panel_threads;
const Index group_id = tmp / scanParameters.group_threads;
tmp = tmp % scanParameters.group_threads;
const Index block_id = tmp / scanParameters.block_threads;
const Index local_id = tmp % scanParameters.block_threads;
// we put one element per packet in scratch_mem
const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
CoeffReturnType inclusive_scan;
// the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is rounded up to a power of 2 for the binary tree
const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
const Index group_offset = group_id * scanParameters.non_scan_stride;
// This will be effective when the size is bigger than elements_per_block
const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
Index next_elements = 0;
EIGEN_UNROLL_LOOP
for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
Index global_id = global_offset + next_elements;
private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
(ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
(global_id < scanParameters.total_size))
? read(dev_eval, global_id)
: accumulator.initialize();
next_elements += scanParameters.scan_stride;
}
first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
if (inclusive) {
inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
}
});
      // This loop always has exactly 2 iterations (ScanPerThread / PacketSize == 2)
EIGEN_UNROLL_LOOP
for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
Index private_offset = 1;
// build sum in place up the tree
EIGEN_UNROLL_LOOP
for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
EIGEN_UNROLL_LOOP
for (Index l = 0; l < d; l++) {
Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(private_scan[ai], &accum);
accumulator.reduce(private_scan[bi], &accum);
private_scan[bi] = accumulator.finalize(accum);
}
private_offset *= 2;
}
scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
private_scan[PacketSize - 1 + packetIndex];
private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
// traverse down tree & build scan
EIGEN_UNROLL_LOOP
for (Index d = 1; d < PacketSize; d *= 2) {
private_offset >>= 1;
EIGEN_UNROLL_LOOP
for (Index l = 0; l < d; l++) {
Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(private_scan[ai], &accum);
accumulator.reduce(private_scan[bi], &accum);
private_scan[ai] = private_scan[bi];
private_scan[bi] = accumulator.finalize(accum);
}
}
}
Index offset = 1;
// build sum in place up the tree
for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (local_id < d) {
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(scratch_ptr[ai], &accum);
accumulator.reduce(scratch_ptr[bi], &accum);
scratch_ptr[bi] = accumulator.finalize(accum);
}
offset *= 2;
}
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// next step optimisation
if (local_id == 0) {
if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
scanParameters.non_scan_size +
group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
block_id;
tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
}
// clear the last element
scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
}
// traverse down tree & build scan
for (Index d = 1; d < scratch_stride; d *= 2) {
offset >>= 1;
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (local_id < d) {
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(scratch_ptr[ai], &accum);
accumulator.reduce(scratch_ptr[bi], &accum);
scratch_ptr[ai] = scratch_ptr[bi];
scratch_ptr[bi] = accumulator.finalize(accum);
}
}
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
      // This loop always has exactly 2 iterations (ScanPerThread / PacketSize == 2)
EIGEN_UNROLL_LOOP
for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
EIGEN_UNROLL_LOOP
for (Index i = 0; i < PacketSize; i++) {
CoeffReturnType accum = private_scan[packetIndex + i];
accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
private_scan[packetIndex + i] = accumulator.finalize(accum);
}
}
first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
if (inclusive) {
accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
private_scan[0] = accumulator.finalize(inclusive_scan);
}
});
next_elements = 0;
      // write out the first set of private params
EIGEN_UNROLL_LOOP
for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
Index global_id = global_offset + next_elements;
if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
scanParameters.scan_size) &&
(global_id < scanParameters.total_size)) {
Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
out_ptr[global_id] = private_scan[private_id];
}
next_elements += scanParameters.scan_stride;
}
} // end for loop
}
};
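// Illustrative sketch: the Blelloch up-sweep / down-sweep that operator()
// above applies first to the per-thread private_scan array and then to the
// shared scratch buffer, written here as a sequential exclusive prefix sum
// over a plain array. The helper name is hypothetical, N is assumed to be a
// power of two, and plain addition stands in for accumulator.reduce()/finalize().
template <typename Scalar, int N>
void exclusive_scan_sketch(Scalar (&data)[N]) {
  int offset = 1;
  // Up-sweep: build partial sums in place up the tree.
  for (int d = N >> 1; d > 0; d >>= 1) {
    for (int l = 0; l < d; ++l) {
      const int ai = offset * (2 * l + 1) - 1;
      const int bi = offset * (2 * l + 2) - 1;
      data[bi] += data[ai];
    }
    offset *= 2;
  }
  // Clear the root, then traverse down the tree to produce the exclusive scan.
  data[N - 1] = Scalar(0);
  for (int d = 1; d < N; d *= 2) {
    offset >>= 1;
    for (int l = 0; l < d; ++l) {
      const int ai = offset * (2 * l + 1) - 1;
      const int bi = offset * (2 * l + 2) - 1;
      const Scalar t = data[ai];
      data[ai] = data[bi];
      data[bi] += t;
    }
  }
}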
template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
InAccessor in_accessor;
OutAccessor out_accessor;
const ScanParameters<Index> scanParameters;
Op accumulator;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
OutAccessor out_accessor_,
const ScanParameters<Index> scanParameters_,
Op accumulator_)
: in_accessor(in_accessor_),
out_accessor(out_accessor_),
scanParameters(scanParameters_),
accumulator(accumulator_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
auto in_ptr = in_accessor.get_pointer();
auto out_ptr = out_accessor.get_pointer();
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
Index tmp = data_offset % scanParameters.panel_threads;
const Index panel_id = data_offset / scanParameters.panel_threads;
const Index group_id = tmp / scanParameters.group_threads;
tmp = tmp % scanParameters.group_threads;
const Index block_id = tmp / scanParameters.block_threads;
const Index local_id = tmp % scanParameters.block_threads;
// the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is rounded up to a power of 2 for the binary tree
const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
const Index group_offset = group_id * scanParameters.non_scan_stride;
// This will be effective when the size is bigger than elements_per_block
const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
CoeffReturnType adjust_val = in_ptr[in_id];
Index next_elements = 0;
EIGEN_UNROLL_LOOP
for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
Index global_id = global_offset + next_elements;
if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
scanParameters.scan_size) &&
(global_id < scanParameters.total_size)) {
CoeffReturnType accum = adjust_val;
accumulator.reduce(out_ptr[global_id], &accum);
out_ptr[global_id] = accumulator.finalize(accum);
}
next_elements += scanParameters.scan_stride;
}
}
}
};
template <typename Index>
struct ScanInfo {
const Index &total_size;
const Index &scan_size;
const Index &panel_size;
const Index &non_scan_size;
const Index &scan_stride;
const Index &non_scan_stride;
Index max_elements_per_block;
Index block_size;
Index panel_threads;
Index group_threads;
Index block_threads;
Index elements_per_group;
Index elements_per_block;
Index loop_range;
Index global_range;
Index local_range;
const Eigen::SyclDevice &dev;
EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
const Eigen::SyclDevice &dev_)
: total_size(total_size_),
scan_size(scan_size_),
panel_size(panel_size_),
non_scan_size(non_scan_size_),
scan_stride(scan_stride_),
non_scan_stride(non_scan_stride_),
dev(dev_) {
// must be power of 2
local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
elements_per_group =
dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
const Index elements_per_panel = elements_per_group * non_scan_size;
elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
const Index max_threads = panel_threads * panel_size;
#endif
global_range = roundUp(max_threads, local_range);
loop_range = Index(
std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
}
inline ScanParameters<Index> get_scan_parameter() {
return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
}
inline cl::sycl::nd_range<1> get_thread_range() {
return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
}
};
template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
Reducer &accumulator, const Index total_size,
const Index scan_size, const Index panel_size,
const Index non_scan_size, const Index scan_stride,
const Index non_scan_stride, const Eigen::SyclDevice &dev) {
auto scan_info =
ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
AdjustFuctor;
dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
scan_info.max_elements_per_block,
scan_info.get_scan_parameter(), accumulator);
}
};
template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
const Index total_size, const Index scan_size, const Index panel_size,
const Index non_scan_size, const Index scan_stride,
const Index non_scan_stride, const bool inclusive,
const Eigen::SyclDevice &dev) {
auto scan_info =
ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
CoeffReturnType *temp_pointer =
static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
scan_info.get_scan_parameter(), accumulator, inclusive);
if (scan_info.block_size > 1) {
ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
non_scan_size, Index(1), scan_info.block_size, false, dev);
SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
non_scan_stride, dev);
}
dev.deallocate_temp(temp_pointer);
}
};
} // namespace internal
} // namespace TensorSycl
template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice> {
typedef typename Self::Index Index;
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::Storage Storage;
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
void operator()(Self &self, EvaluatorPointerType data) {
const Index total_size = internal::array_prod(self.dimensions());
const Index scan_size = self.size();
const Index scan_stride = self.stride();
// this is the scan op (can be sum or ...)
auto accumulator = self.accumulator();
auto inclusive = !self.exclusive();
auto consume_dim = self.consume_dim();
auto dev = self.device();
auto dims = self.inner().dimensions();
Index non_scan_size = 1;
Index panel_size = 1;
if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < consume_dim; i++) {
non_scan_size *= dims[i];
}
for (int i = consume_dim + 1; i < Self::NumDims; i++) {
panel_size *= dims[i];
}
} else {
for (int i = Self::NumDims - 1; i > consume_dim; i--) {
non_scan_size *= dims[i];
}
for (int i = consume_dim - 1; i >= 0; i--) {
panel_size *= dims[i];
}
}
const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
auto eval_impl = self.inner();
TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
inclusive, dev);
}
};
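// Illustrative sketch: how operator() above splits the tensor dimensions
// around the scanned dimension. For a ColMajor tensor of dimensions
// {4, 1024, 8} scanned along dimension 1, the dimensions before the scan axis
// form non_scan_size and those after it form panel_size. The helper below
// reproduces that bookkeeping for a plain dimension array; the name and fixed
// rank are hypothetical.
template <int NumDims>
void scan_sizes_sketch(const int (&dims)[NumDims], int consume_dim, bool col_major,
                       int &non_scan_size, int &panel_size) {
  non_scan_size = 1;
  panel_size = 1;
  if (col_major) {
    for (int i = 0; i < consume_dim; i++) non_scan_size *= dims[i];
    for (int i = consume_dim + 1; i < NumDims; i++) panel_size *= dims[i];
  } else {
    for (int i = NumDims - 1; i > consume_dim; i--) non_scan_size *= dims[i];
    for (int i = consume_dim - 1; i >= 0; i--) panel_size *= dims[i];
  }
  // ColMajor {4, 1024, 8} with consume_dim = 1  =>  non_scan_size = 4, panel_size = 8.
}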
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP


@ -1,120 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: eigen@codeplay.com
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// General include header of SYCL target for Tensor Module
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
#ifdef EIGEN_USE_SYCL
// global pointer to set different attribute state for a class
template <class T>
struct MakeGlobalPointer {
typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
typedef typename cl::sycl::global_ptr<T>::reference_t RefType;
};
// global pointer to set different attribute state for a class
template <class T>
struct MakeLocalPointer {
typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
typedef typename cl::sycl::local_ptr<T>::reference_t RefType;
};
namespace Eigen {
template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp;
template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>;
namespace internal {
#ifdef __SYCL_DEVICE_ONLY__
template<typename A, typename B> struct TypeConversion {
template<typename T>
static typename MakeGlobalPointer<A>::Type get_address_space_pointer(typename MakeGlobalPointer<T>::Type p);
template<typename T>
static typename MakeLocalPointer<A>::Type get_address_space_pointer(typename MakeLocalPointer<T>::Type p);
template<typename T>
static A* get_address_space_pointer(T* p);
typedef decltype(get_address_space_pointer(B())) type;
};
#endif
}
namespace TensorSycl {
namespace internal {
template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
struct NoOP;
template<bool IsConst, typename T> struct GetType{
typedef const T Type;
};
template<typename T> struct GetType<false, T>{
typedef T Type;
};
template <bool Conds, size_t X , size_t Y > struct ValueCondition {
static constexpr size_t Res =X;
};
template<size_t X, size_t Y> struct ValueCondition<false, X, Y> {
static constexpr size_t Res =Y;
};
}
}
}
// tuple construction
#include "TensorSyclTuple.h"
// counting number of leaf at compile time
#include "TensorSyclLeafCount.h"
// The index PlaceHolder takes the actual expression and replaces the actual
// data on it with the place holder. It uses the same pre-order expression tree
// traverse as the leaf count in order to give the right access number to each
// node in the expression
#include "TensorSyclPlaceHolderExpr.h"
// creation of an accessor tuple from a tuple of SYCL buffers
#include "TensorSyclExtractAccessor.h"
// this is used to change the address space type in tensor map for GPU
#include "TensorSyclConvertToDeviceExpression.h"
// this is used to extract the functors
#include "TensorSyclExtractFunctors.h"
// this is used to create tensormap on the device
// this is used to construct the expression on the device
#include "TensorSyclExprConstructor.h"
/// this is used for extracting tensor reduction
#include "TensorReductionSycl.h"
// TensorArgMaxSycl.h
#include "TensorArgMaxSycl.h"
/// this is used for extracting tensor convolution
#include "TensorConvolutionSycl.h"
// kernel execution using fusion
#include "TensorSyclRun.h"
//sycl functors
#include "TensorSyclFunctors.h"
#include "TensorContractionSycl.h"
#endif // end of EIGEN_USE_SYCL
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H


@ -1,205 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorSyclConvertToDeviceExpression.h
*
* \brief:
* Conversion from host pointer to device pointer
* inside leaf nodes of the expression.
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
/// \struct ConvertToDeviceExpression
/// \brief This struct is used to convert the MakePointer in the host expression
/// to the MakeGlobalPointer for the device expression. For the leafNodes
/// containing the pointer. This is due to the fact that the address space of
/// the pointer T* is different on the host and the device.
template <typename Expr>
struct ConvertToDeviceExpression;
template<template<class...> class NonOpCategory, bool IsConst, typename... Args>
struct NonOpConversion{
typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type...> >::Type Type;
};
template<template<class, template <class> class > class NonOpCategory, bool IsConst, typename Args>
struct DeviceConvertor{
typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type, MakeGlobalPointer> >::Type Type;
};
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
/// type is TensorMap
#define TENSORMAPCONVERT(CVQual)\
template <typename T, int Options_, template <class> class MakePointer_>\
struct ConvertToDeviceExpression<CVQual TensorMap<T, Options_, MakePointer_> > {\
typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
};
TENSORMAPCONVERT(const)
TENSORMAPCONVERT()
#undef TENSORMAPCONVERT
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp
#define CATEGORYCONVERT(CVQual)\
template <template<class, class...> class Category, typename OP, typename... subExprs>\
struct ConvertToDeviceExpression<CVQual Category<OP, subExprs...> > {\
typedef CVQual Category<OP, typename ConvertToDeviceExpression<subExprs>::Type... > Type;\
};
CATEGORYCONVERT(const)
CATEGORYCONVERT()
#undef CATEGORYCONVERT
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
/// type is TensorCwiseSelectOp
#define SELECTOPCONVERT(CVQual, Res)\
template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
struct ConvertToDeviceExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >\
: NonOpConversion<TensorSelectOp, Res, IfExpr, ThenExpr, ElseExpr> {};
SELECTOPCONVERT(const, true)
SELECTOPCONVERT(, false)
#undef SELECTOPCONVERT
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
/// type is const AssingOP
#define ASSIGNCONVERT(CVQual, Res)\
template <typename LHSExpr, typename RHSExpr>\
struct ConvertToDeviceExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr> >\
: NonOpConversion<TensorAssignOp, Res, LHSExpr, RHSExpr>{};
ASSIGNCONVERT(const, true)
ASSIGNCONVERT(, false)
#undef ASSIGNCONVERT
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
/// type is TensorEvalToOp
#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\
template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
: DeviceConvertor<ExprNode, Res, Expr>{};
KERNELBROKERCONVERT(const, true, TensorEvalToOp)
KERNELBROKERCONVERT(, false, TensorEvalToOp)
#undef KERNELBROKERCONVERT
/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp
#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\
template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
};
// TensorForcedEvalOp
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp)
// TensorLayoutSwapOp
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp)
//TensorIndexTupleOp
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp)
#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
#define KERNELBROKERCONVERTREDUCTION(CVQual)\
template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\
struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\
typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\
};
KERNELBROKERCONVERTREDUCTION(const)
KERNELBROKERCONVERTREDUCTION()
#undef KERNELBROKERCONVERTREDUCTION
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\
template <typename OP, typename Dim, typename subExpr>\
struct ConvertToDeviceExpression<CVQual TensorTupleReducerOp<OP, Dim, subExpr> > {\
typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\
};
KERNELBROKERCONVERTTUPLEREDUCTION(const)
KERNELBROKERCONVERTTUPLEREDUCTION()
#undef KERNELBROKERCONVERTTUPLEREDUCTION
//TensorSlicingOp
#define KERNELBROKERCONVERTSLICEOP(CVQual)\
template<typename StartIndices, typename Sizes, typename XprType>\
struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename ConvertToDeviceExpression<XprType>::Type> Type;\
};
KERNELBROKERCONVERTSLICEOP(const)
KERNELBROKERCONVERTSLICEOP()
#undef KERNELBROKERCONVERTSLICEOP
//TensorStridingSlicingOp
#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename ConvertToDeviceExpression<XprType>::Type> Type;\
};
KERNELBROKERCONVERTERSLICESTRIDEOP(const)
KERNELBROKERCONVERTERSLICESTRIDEOP()
#undef KERNELBROKERCONVERTERSLICESTRIDEOP
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
template <DenseIndex DimId, typename Expr>\
struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\
typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\
};
KERNELBROKERCONVERTCHIPPINGOP(const)
KERNELBROKERCONVERTCHIPPINGOP()
#undef KERNELBROKERCONVERTCHIPPINGOP
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp
#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
struct ConvertToDeviceExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType> >{\
typedef CVQual TensorImagePatchOp<Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\
};
KERNELBROKERCONVERTIMAGEPATCHOP(const)
KERNELBROKERCONVERTIMAGEPATCHOP()
#undef KERNELBROKERCONVERTIMAGEPATCHOP
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp
#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\
template<DenseIndex Plannes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
struct ConvertToDeviceExpression<CVQual TensorVolumePatchOp<Plannes, Rows, Cols, XprType> >{\
typedef CVQual TensorVolumePatchOp<Plannes, Rows, Cols, typename ConvertToDeviceExpression<XprType>::Type> Type;\
};
KERNELBROKERCONVERTVOLUMEPATCHOP(const)
KERNELBROKERCONVERTVOLUMEPATCHOP()
#undef KERNELBROKERCONVERTVOLUMEPATCHOP
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX1


@ -1,514 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorSyclExprConstructor.h
*
* \brief:
* This file re-create an expression on the SYCL device in order
* to use the original tensor evaluator.
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
template <typename Expr, typename Dims>
struct DeviceFixedSizeTensor;
template <typename Expr, typename std::ptrdiff_t... Indices>
struct DeviceFixedSizeTensor<Expr, Eigen::Sizes<Indices...>>{
template<typename Data>
static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);}
};
/// this class is used by EvalToOp in order to create an lhs expression which is
/// a pointer from an accessor on device-only buffer
template <typename PtrType, size_t N, typename... Params>
struct EvalToLHSConstructor {
PtrType expr;
EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all<PtrType>::type, utility::tuple::get<N>(t))) {}
};
/// struct ExprConstructor is used to reconstruct the expression on the device and
/// recreate the expression with MakeGlobalPointer containing the device address
/// space for the TensorMap pointers used in eval function.
/// It receives the original expression type, the functor of the node, the tuple
/// of accessors, and the device expression type to re-instantiate the
/// expression tree for the device
template <typename OrigExpr, typename IndexExpr, typename... Params>
struct ExprConstructor;
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorMap
#define TENSORMAP(CVQual)\
template <typename T, int Options_,\
template <class> class MakePointer_, size_t N, typename... Params>\
struct ExprConstructor< CVQual TensorMap<T, Options_, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\
typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
: expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
};
TENSORMAP(const)
TENSORMAP()
#undef TENSORMAP
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorMap
#define TENSORMAPFIXEDSIZE(CVQual)\
template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_,\
template <class> class MakePointer_, size_t N, typename... Params>\
struct ExprConstructor< CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_>, N>, Params...>{\
typedef CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
: expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
};
TENSORMAPFIXEDSIZE(const)
TENSORMAPFIXEDSIZE()
#undef TENSORMAPFIXEDSIZE
#define UNARYCATEGORY(CVQual)\
template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_type;\
my_type rhsExpr;\
typedef CVQual UnaryCategory<OP, typename my_type::Type> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\
};
UNARYCATEGORY(const)
UNARYCATEGORY()
#undef UNARYCATEGORY
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorBinaryOp
#define BINARYCATEGORY(CVQual)\
template <template<class, class, class> class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\
typename RHSExpr, typename... Params>\
struct ExprConstructor<CVQual BinaryCategory<OP, OrigLHSExpr, OrigRHSExpr>, CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Params...> {\
typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
typedef CVQual BinaryCategory<OP, typename my_left_type::Type, typename my_right_type::Type> Type;\
my_left_type lhsExpr;\
my_right_type rhsExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\
};
BINARYCATEGORY(const)
BINARYCATEGORY()
#undef BINARYCATEGORY
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorCwiseTernaryOp
#define TERNARYCATEGORY(CVQual)\
template <template <class, class, class, class> class TernaryCategory, typename OP, typename OrigArg1Expr, typename OrigArg2Expr,typename OrigArg3Expr,\
typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename... Params>\
struct ExprConstructor<CVQual TernaryCategory<OP, OrigArg1Expr, OrigArg2Expr, OrigArg3Expr>, CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Params...> {\
typedef ExprConstructor<OrigArg1Expr, Arg1Expr, Params...> my_arg1_type;\
typedef ExprConstructor<OrigArg2Expr, Arg2Expr, Params...> my_arg2_type;\
typedef ExprConstructor<OrigArg3Expr, Arg3Expr, Params...> my_arg3_type;\
typedef CVQual TernaryCategory<OP, typename my_arg1_type::Type, typename my_arg2_type::Type, typename my_arg3_type::Type> Type;\
my_arg1_type arg1Expr;\
my_arg2_type arg2Expr;\
my_arg3_type arg3Expr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD,const utility::tuple::Tuple<Params...> &t)\
: arg1Expr(funcD.arg1Expr, t), arg2Expr(funcD.arg2Expr, t), arg3Expr(funcD.arg3Expr, t), expr(arg1Expr.expr, arg2Expr.expr, arg3Expr.expr, funcD.func) {}\
};
TERNARYCATEGORY(const)
TERNARYCATEGORY()
#undef TERNARYCATEGORY
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorCwiseSelectOp
#define SELECTOP(CVQual)\
template <typename OrigIfExpr, typename OrigThenExpr, typename OrigElseExpr, typename IfExpr, typename ThenExpr, typename ElseExpr, typename... Params>\
struct ExprConstructor< CVQual TensorSelectOp<OrigIfExpr, OrigThenExpr, OrigElseExpr>, CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Params...> {\
typedef ExprConstructor<OrigIfExpr, IfExpr, Params...> my_if_type;\
typedef ExprConstructor<OrigThenExpr, ThenExpr, Params...> my_then_type;\
typedef ExprConstructor<OrigElseExpr, ElseExpr, Params...> my_else_type;\
typedef CVQual TensorSelectOp<typename my_if_type::Type, typename my_then_type::Type, typename my_else_type::Type> Type;\
my_if_type ifExpr;\
my_then_type thenExpr;\
my_else_type elseExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: ifExpr(funcD.ifExpr, t), thenExpr(funcD.thenExpr, t), elseExpr(funcD.elseExpr, t), expr(ifExpr.expr, thenExpr.expr, elseExpr.expr) {}\
};
SELECTOP(const)
SELECTOP()
#undef SELECTOP
/// specialisation of the \ref ExprConstructor struct when the node type is
/// const TensorAssignOp
#define ASSIGN(CVQual)\
template <typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr, typename RHSExpr, typename... Params>\
struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual TensorAssignOp<LHSExpr, RHSExpr>, Params...> {\
typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
typedef CVQual TensorAssignOp<typename my_left_type::Type, typename my_right_type::Type> Type;\
my_left_type lhsExpr;\
my_right_type rhsExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: lhsExpr(funcD.lhsExpr, t), rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr) {}\
};
ASSIGN(const)
ASSIGN()
#undef ASSIGN
/// specialisation of the \ref ExprConstructor struct when the node type is
/// const TensorAssignOp
#define CONVERSIONEXPRCONST(CVQual)\
template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
my_nested_type nestedExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
};
CONVERSIONEXPRCONST(const)
CONVERSIONEXPRCONST()
#undef CONVERSIONEXPRCONST
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorEvalToOp /// 0 here is the output number in the buffer
#define EVALTO(CVQual)\
template <typename OrigExpr, typename Expr, typename... Params>\
struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
typedef ExprConstructor<OrigExpr, Expr, Params...> my_expr_type;\
typedef typename TensorEvalToOp<OrigExpr, MakeGlobalPointer>::PointerType my_buffer_type;\
typedef CVQual TensorEvalToOp<typename my_expr_type::Type, MakeGlobalPointer> Type;\
my_expr_type nestedExpression;\
EvalToLHSConstructor<my_buffer_type, 0, Params...> buffer;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: nestedExpression(funcD.xprExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
};
EVALTO(const)
EVALTO()
#undef EVALTO
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorForcedEvalOp
#define FORCEDEVAL(CVQual)\
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
typedef TensorForcedEvalOp<OrigExpr> XprType;\
typedef CVQual TensorMap<\
Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
Eigen::internal::traits<XprType>::Layout, \
MakeGlobalPointer\
> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
: expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
#define TENSORCUSTOMUNARYOP(CVQual)\
template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\
typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\
typedef CVQual TensorMap<\
Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
Eigen::internal::traits<XprType>::Layout, \
MakeGlobalPointer\
> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
: expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
TENSORCUSTOMUNARYOP(const)
TENSORCUSTOMUNARYOP()
#undef TENSORCUSTOMUNARYOP
/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
#define SYCLREDUCTIONEXPR(CVQual)\
template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
:expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
SYCLREDUCTIONEXPR(const)
SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR
/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp
/// The TensorMap is built from the underlying TensorReductionOp rather than from the TensorTupleReducerOp itself, because the TensorMap represents the output of the reduction.
#define SYCLTUPLEREDUCTIONEXPR(CVQual)\
template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\
CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\
static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\
static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\
static const int Layout =static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\
typedef CVQual TensorMap<\
Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\
Layout,\
MakeGlobalPointer\
> XprType;\
typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\
static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\
typedef array<Index, NumDims> StrideDims;\
typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
:expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\
fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\
}\
};
SYCLTUPLEREDUCTIONEXPR(const)
SYCLTUPLEREDUCTIONEXPR()
#undef SYCLTUPLEREDUCTIONEXPR
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorContractionOp, TensorConvolutionOp or TensorCustomBinaryOp
#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\
template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\
static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\
typedef CVQual TensorMap<\
Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\
Eigen::internal::traits<XprTyp>::Layout, \
MakeGlobalPointer\
> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
:expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
//TensorContractionOp
SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
//TensorConvolutionOp
SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
//TensorCustomBinaryOp
SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
#undef SYCLCONTRACTCONVCUSBIOPS
//TensorSlicingOp
#define SYCLSLICEOPEXPR(CVQual)\
template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type;\
my_xpr_type xprExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.dimensions()) {}\
};
SYCLSLICEOPEXPR(const)
SYCLSLICEOPEXPR()
#undef SYCLSLICEOPEXPR
//TensorStridingSlicingOp
#define SYCLSLICESTRIDEOPEXPR(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename my_xpr_type::Type> Type;\
my_xpr_type xprExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(),funcD.strides()) {}\
};
SYCLSLICESTRIDEOPEXPR(const)
SYCLSLICESTRIDEOPEXPR()
#undef SYCLSLICESTRIDEOPEXPR
//TensorReshapingOp and TensorShufflingOp
#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
my_xpr_type xprExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
};
// TensorReshapingOp
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
// TensorShufflingOp
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
//TensorPaddingOp
#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
my_xpr_type xprExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
};
//TensorPaddingOp
SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
#undef SYCLPADDINGOPEXPRCONST
// TensorChippingOp
#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... >{\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\
my_xpr_type xprExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\
};
SYCLTENSORCHIPPINGOPEXPR(const)
SYCLTENSORCHIPPINGOPEXPR()
#undef SYCLTENSORCHIPPINGOPEXPR
// TensorImagePatchOp
#define SYCLTENSORIMAGEPATCHOPEXPR(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorImagePatchOp<Rows, Cols, OrigXprType>, CVQual TensorImagePatchOp<Rows, Cols, XprType>, Params... > {\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual TensorImagePatchOp<Rows, Cols, typename my_xpr_type::Type> Type;\
my_xpr_type xprExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\
funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, funcD.m_padding_explicit, \
funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_type, funcD.m_padding_value){}\
};
SYCLTENSORIMAGEPATCHOPEXPR(const)
SYCLTENSORIMAGEPATCHOPEXPR()
#undef SYCLTENSORIMAGEPATCHOPEXPR
// TensorVolumePatchOp
#define SYCLTENSORVOLUMEPATCHOPEXPR(CVQual)\
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorVolumePatchOp<Planes, Rows, Cols, OrigXprType>, CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Params... > {\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, typename my_xpr_type::Type> Type;\
my_xpr_type xprExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\
funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \
funcD.m_padding_explicit, funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, \
funcD.m_padding_type, funcD.m_padding_value ){\
}\
};
SYCLTENSORVOLUMEPATCHOPEXPR(const)
SYCLTENSORVOLUMEPATCHOPEXPR()
#undef SYCLTENSORVOLUMEPATCHOPEXPR
// TensorLayoutSwapOp and TensorIndexTupleOp
#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\
template<typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... >{\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\
my_xpr_type xprExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\
};
//TensorLayoutSwapOp
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp)
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp)
//TensorIndexTupleOp
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp)
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp)
#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR
/// template deduction for \ref ExprConstructor struct
template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
-> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) {
return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t);
}
} /// namespace TensorSycl
} /// namespace internal
} /// namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
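The sketch below is an illustration only, not Eigen's API: every name in it (PlaceHolder, AddOp, Rebuild) is hypothetical. It shows, in a self-contained form, the pattern that ExprConstructor and createDeviceExpression implement above: a placeholder leaf indexed by N is rebuilt on the device side by fetching the Nth buffer out of a tuple, and interior nodes are rebuilt recursively from their children.
// Minimal analogue of the ExprConstructor pattern (hypothetical types, not Eigen's API).
#include <cstddef>
#include <iostream>
#include <tuple>
template <size_t N> struct PlaceHolder {};          // leaf: "use the N-th buffer"
template <typename L, typename R> struct AddOp {};  // interior node of the expression tree
template <typename Expr> struct Rebuild;            // device-side reconstruction, like ExprConstructor
template <size_t N> struct Rebuild<PlaceHolder<N> > {
  const float* ptr;
  template <typename Tuple> explicit Rebuild(const Tuple& t) : ptr(std::get<N>(t)) {}
  float coeff(size_t i) const { return ptr[i]; }
};
template <typename L, typename R> struct Rebuild<AddOp<L, R> > {
  Rebuild<L> lhs; Rebuild<R> rhs;
  template <typename Tuple> explicit Rebuild(const Tuple& t) : lhs(t), rhs(t) {}
  float coeff(size_t i) const { return lhs.coeff(i) + rhs.coeff(i); }
};
int main() {
  float a[3] = {1, 2, 3}, b[3] = {10, 20, 30};
  auto buffers = std::make_tuple(static_cast<const float*>(a), static_cast<const float*>(b));
  Rebuild<AddOp<PlaceHolder<0>, PlaceHolder<1> > > expr(buffers);  // analogous to createDeviceExpression
  std::cout << expr.coeff(2) << "\n";                              // prints 33
}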

View File

@ -1,310 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorSyclExtractAccessor.h
*
* \brief:
* ExtractAccessor takes a PlaceHolder expression and the tuple of SYCL
* buffers as its input. Using a pre-order tree traversal, ExtractAccessor
* recursively calls itself for the children of each node in the expression
* tree. A leaf node in the PlaceHolder expression is simply a container that
* preserves the position of the actual data in the tuple of SYCL buffers. By
* invoking the extract accessor for PlaceHolder<N>, an accessor is created for
* the Nth buffer in the tuple of buffers and then added as the Nth element of
* the tuple of accessors, so the order of the data in the expression tree is
* preserved.
*
* These are the specialisations of the extract accessor method for the
* different operation types in the PlaceHolder expression.
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}
/// struct ExtractAccessor: the ExtractAccessor class is used to extract the
/// accessor from a buffer.
/// Depending on the type of the leaf node we get either a read accessor or a
/// read_write accessor.
template <typename Evaluator>
struct ExtractAccessor;
struct AccessorConstructor{
template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))
template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2)
RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)))
template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3)
RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))))
template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval)
RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())))
};
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
#define SYCLUNARYCATEGORYEXTACC(CVQual)\
template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLUNARYCATEGORYEXTACC(const)
SYCLUNARYCATEGORYEXTACC()
#undef SYCLUNARYCATEGORYEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
#define SYCLBINARYCATEGORYEXTACC(CVQual)\
template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
};
SYCLBINARYCATEGORYEXTACC(const)
SYCLBINARYCATEGORYEXTACC()
#undef SYCLBINARYCATEGORYEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorCwiseTernaryOp
#define SYCLTERNARYCATEGORYEXTACC(CVQual)\
template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
};
SYCLTERNARYCATEGORYEXTACC(const)
SYCLTERNARYCATEGORYEXTACC()
#undef SYCLTERNARYCATEGORYEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorCwiseSelectOp. This is a special case where there is no OP
#define SYCLSELECTOPEXTACC(CVQual)\
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
};
SYCLSELECTOPEXTACC(const)
SYCLSELECTOPEXTACC()
#undef SYCLSELECTOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
#define SYCLTENSORASSIGNOPEXTACC(CVQual)\
template <typename LHSExpr, typename RHSExpr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
};
SYCLTENSORASSIGNOPEXTACC(const)
SYCLTENSORASSIGNOPEXTACC()
#undef SYCLTENSORASSIGNOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
#define TENSORMAPEXPR(CVQual, ACCType)\
template <typename PlainObjectType, int Options_, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
};
TENSORMAPEXPR(const, cl::sycl::access::mode::read)
TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
#undef TENSORMAPEXPR
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
#define SYCLFORCEDEVALEXTACC(CVQual)\
template <typename Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
SYCLFORCEDEVALEXTACC(const)
SYCLFORCEDEVALEXTACC()
#undef SYCLFORCEDEVALEXTACC
//TensorCustomUnaryOp
#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
template <typename CustomUnaryFunc, typename XprType, typename Dev >\
struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
SYCLCUSTOMUNARYOPEXTACC(const)
SYCLCUSTOMUNARYOPEXTACC()
#undef SYCLCUSTOMUNARYOPEXTACC
//TensorCustomBinaryOp
#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
SYCLCUSTOMBINARYOPEXTACC(const)
SYCLCUSTOMBINARYOPEXTACC()
#undef SYCLCUSTOMBINARYOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
#define SYCLEVALTOEXTACC(CVQual)\
template <typename Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
};
SYCLEVALTOEXTACC(const)
SYCLEVALTOEXTACC()
#undef SYCLEVALTOEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\
template <typename OP, typename Dim, typename Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
// TensorReductionOp
SYCLREDUCTIONEXTACC(const,TensorReductionOp)
SYCLREDUCTIONEXTACC(,TensorReductionOp)
// TensorTupleReducerOp
SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp)
SYCLREDUCTIONEXTACC(,TensorTupleReducerOp)
#undef SYCLREDUCTIONEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
};
//TensorContractionOp
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
//TensorConvolutionOp
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorSlicingOp.
#define SYCLSLICEOPEXTACC(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLSLICEOPEXTACC(const)
SYCLSLICEOPEXTACC()
#undef SYCLSLICEOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorStridingSlicingOp.
#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLSLICESTRIDEOPEXTACC(const)
SYCLSLICESTRIDEOPEXTACC()
#undef SYCLSLICESTRIDEOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorChippingOp.
#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\
template<DenseIndex DimId, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLTENSORCHIPPINGOPEXTACC(const)
SYCLTENSORCHIPPINGOPEXTACC()
#undef SYCLTENSORCHIPPINGOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorImagePatchOp.
#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev> >{\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLTENSORIMAGEPATCHOPEXTACC(const)
SYCLTENSORIMAGEPATCHOPEXTACC()
#undef SYCLTENSORIMAGEPATCHOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorVolumePatchOp.
#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev> >{\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
};
SYCLTENSORVOLUMEPATCHOPEXTACC(const)
SYCLTENSORVOLUMEPATCHOPEXTACC()
#undef SYCLTENSORVOLUMEPATCHOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorLayoutSwapOp, TensorIndexTupleOp
#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\
template<typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
};
// TensorLayoutSwapOp
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp)
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp)
//TensorIndexTupleOp
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp)
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp)
#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC
/// template deduction for \ref ExtractAccessor
template <typename Evaluator>
auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)
-> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, eval)) {
return ExtractAccessor<Evaluator>::getTuple(cgh, eval);
}
} /// namespace TensorSycl
} /// namespace internal
} /// namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
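For illustration only, here is a minimal sketch of the pre-order traversal described in the header comment of the file above, assuming hypothetical Leaf/Binary node types rather than Eigen's evaluators: one tuple entry is appended per leaf, so the left-to-right order of the leaves is preserved, just as ExtractAccessor appends one accessor per buffer.
// Simplified analogue of ExtractAccessor/createTupleOfAccessors (hypothetical types).
#include <iostream>
#include <string>
#include <tuple>
struct Leaf { std::string name; };
template <typename L, typename R> struct Binary { L lhs; R rhs; };
template <typename T> struct ExtractNames;
template <> struct ExtractNames<Leaf> {
  static std::tuple<std::string> get(const Leaf& l) { return std::make_tuple(l.name); }
};
template <typename L, typename R> struct ExtractNames<Binary<L, R> > {
  static auto get(const Binary<L, R>& b)
      -> decltype(std::tuple_cat(ExtractNames<L>::get(b.lhs), ExtractNames<R>::get(b.rhs))) {
    // pre-order: left subtree first, then right subtree, concatenated in order
    return std::tuple_cat(ExtractNames<L>::get(b.lhs), ExtractNames<R>::get(b.rhs));
  }
};
int main() {
  Binary<Leaf, Binary<Leaf, Leaf> > tree{{"a"}, {{"b"}, {"c"}}};
  auto names = ExtractNames<Binary<Leaf, Binary<Leaf, Leaf> > >::get(tree);
  std::cout << std::get<0>(names) << std::get<1>(names) << std::get<2>(names) << "\n";  // prints abc
}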

View File

@ -1,467 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorSyclextractFunctors.h
*
* \brief:
* Used to extract all the functors allocated to each node of the expression tree.
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
/// struct FunctorExtractor: This struct is used to extract the functors
/// constructed on the host side, so that they can be packed and reused when the
/// expression is reconstructed on the device.
/// This is necessary because Eigen functors are not stateless, so we cannot
/// re-instantiate them on the device; the already-instantiated functors have to
/// be passed to the device instead.
// The default below is used for leaf nodes (TensorMap) and nodes behaving like leaf nodes (TensorForcedEval).
#define DEFALTACTION(Evaluator)\
typedef typename Evaluator::Dimensions Dimensions;\
const Dimensions m_dimensions;\
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {}
template <typename Evaluator> struct FunctorExtractor{
DEFALTACTION(Evaluator)
};
/// specialisation of the \ref FunctorExtractor struct when the node type does not require any functor state:
/// TensorConversionOp
#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
template <typename ArgType1, typename ArgType2, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
: subExpr(expr.impl()) {}\
};
SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
#undef SYCLEXTRFUNCCONVERSION
#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
FunctorExtractor(const TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev>& ){}\
};
SYCLEXTRTENSORMAPFIXEDSIZE(const)
SYCLEXTRTENSORMAPFIXEDSIZE()
#undef SYCLEXTRTENSORMAPFIXEDSIZE
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
#define SYCLEXTRFUNCUNARY(CVQual)\
template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
const OP func;\
FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\
: rhsExpr(expr.impl()), func(expr.functor()) {}\
};
SYCLEXTRFUNCUNARY(const)
SYCLEXTRFUNCUNARY()
#undef SYCLEXTRFUNCUNARY
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorCwiseBinaryOp
#define SYCLEXTRFUNCBIINARY(CVQual)\
template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
const OP func;\
FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\
};
SYCLEXTRFUNCBIINARY(const)
SYCLEXTRFUNCBIINARY()
#undef SYCLEXTRFUNCBIINARY
/// specialisation of the \ref FunctorExtractor struct when the node type is TensorCwiseTernaryOp
#define SYCLEXTRFUNCTERNARY(CVQual)\
template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\
FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\
FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\
const OP func;\
FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\
: arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\
};
SYCLEXTRFUNCTERNARY(const)
SYCLEXTRFUNCTERNARY()
#undef SYCLEXTRFUNCTERNARY
// TensorCustomOp must be specialised separately: otherwise it would be captured by the UnaryCategory
// specialisation, even though its behaviour is different and matches the general FunctorExtractor instead.
/// specialisation of the \ref FunctorExtractor struct when the node type is TensorCustomUnaryOp
#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
template <typename CustomUnaryFunc, typename ArgType, typename Dev >\
struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\
typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
DEFALTACTION(Evaluator)\
};
//TensorCustomUnaryOp
SYCLEXTRFUNCCUSTOMUNARYOP(const)
SYCLEXTRFUNCCUSTOMUNARYOP()
#undef SYCLEXTRFUNCCUSTOMUNARYOP
//TensorCustomBinaryOp
#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\
template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
DEFALTACTION(Evaluator)\
};
//TensorCustomBinaryOp
SYCLEXTRFUNCCUSTOMBIBARYOP(const)
SYCLEXTRFUNCCUSTOMBIBARYOP()
#undef SYCLEXTRFUNCCUSTOMBIBARYOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// TensorCwiseSelectOp. This is a specialisation without an OP, so it has to be separated.
#define SYCLEXTRFUNCSELECTOP(CVQual)\
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
struct FunctorExtractor< TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;\
FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;\
FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;\
FunctorExtractor(const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)\
: ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}\
};
SYCLEXTRFUNCSELECTOP(const)
SYCLEXTRFUNCSELECTOP()
#undef SYCLEXTRFUNCSELECTOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// const TensorAssignOp. This is a specialisation without an OP, so it has to be separated.
#define SYCLEXTRFUNCASSIGNOP(CVQual)\
template <typename LHSExpr, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
FunctorExtractor(const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)\
: lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}\
};
SYCLEXTRFUNCASSIGNOP(const)
SYCLEXTRFUNCASSIGNOP()
#undef SYCLEXTRFUNCASSIGNOP
/// specialisation of the \ref FunctorExtractor struct when the node types are
/// TensorEvalToOp, TensorLayoutSwapOp and TensorIndexTupleOp. This is a specialisation without an OP, so it has to be separated.
#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\
template <typename Expr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\
FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Expr>, Dev>& expr)\
: xprExpr(expr.impl()) {}\
};
//TensorEvalToOp
SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp)
SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp)
// TensorLayoutSwapOp
SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp)
SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp)
// TensorIndexTupleOp
SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp)
SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp)
#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE
template<typename Dim, size_t NumOutputDim> struct DimConstr {
template<typename InDim>
static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return dims;}
};
template<typename Dim> struct DimConstr<Dim, 0> {
template<typename InDim>
static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
};
//TensorReductionOp
#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\
typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
const Dimensions m_dimensions;\
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
: m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
};
SYCLEXTRFUNCREDUCTIONOP(const)
SYCLEXTRFUNCREDUCTIONOP()
#undef SYCLEXTRFUNCREDUCTIONOP
//TensorTupleReducerOp
#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\
template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\
typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\
static const int NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\
typedef typename Evaluator::StrideDims StrideDims;\
typedef typename Evaluator::Index Index;\
typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
const Dimensions m_dimensions;\
const Index m_return_dim;\
const StrideDims m_strides;\
const Index m_stride_mod;\
const Index m_stride_div;\
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}\
EIGEN_STRONG_INLINE const StrideDims strides() const {return m_strides;}\
EIGEN_STRONG_INLINE const Index stride_mod() const {return m_stride_mod;}\
EIGEN_STRONG_INLINE const Index stride_div() const {return m_stride_div;}\
FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\
: m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\
m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\
};
SYCLEXTRFUNCTUPLEREDUCTIONOP(const)
SYCLEXTRFUNCTUPLEREDUCTIONOP()
#undef SYCLEXTRFUNCTUPLEREDUCTIONOP
//TensorContractionOp and TensorConvolutionOp
#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
typedef typename Evaluator::Dimensions Dimensions;\
const Dimensions m_dimensions;\
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
: m_dimensions(expr.dimensions()) {}\
};
//TensorContractionOp
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
//TensorConvolutionOp
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
/// const TensorSlicingOp. This is a specialisation without an OP, so it has to be separated.
#define SYCLEXTRFUNCTSLICEOP(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
const StartIndices m_offsets;\
const Sizes m_dimensions;\
FunctorExtractor(const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& expr)\
: xprExpr(expr.impl()), m_offsets(expr.startIndices()), m_dimensions(expr.dimensions()) {}\
EIGEN_STRONG_INLINE const StartIndices& startIndices() const {return m_offsets;}\
EIGEN_STRONG_INLINE const Sizes& dimensions() const {return m_dimensions;}\
};
SYCLEXTRFUNCTSLICEOP(const)
SYCLEXTRFUNCTSLICEOP()
#undef SYCLEXTRFUNCTSLICEOP
//TensorStridingSlicingOp
#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
const StartIndices m_startIndices;\
const StopIndices m_stopIndices;\
const Strides m_strides;\
FunctorExtractor(const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices,Strides, XprType>, Dev>& expr)\
: xprExpr(expr.impl()), m_startIndices(expr.exprStartIndices()), m_stopIndices(expr.exprStopIndices()), m_strides(expr.strides()) {}\
EIGEN_STRONG_INLINE const StartIndices& startIndices() const { return m_startIndices; }\
EIGEN_STRONG_INLINE const StartIndices& stopIndices() const { return m_stopIndices; }\
EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }\
};
SYCLEXTRFUNCTSLICESTRIDEOP(const)
SYCLEXTRFUNCTSLICESTRIDEOP()
#undef SYCLEXTRFUNCTSLICESTRIDEOP
// TensorReshapingOp and TensorShufflingOp have to be specialised separately; otherwise they would be matched by the UnaryCategory specialisation.
#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
template<typename Param, typename XprType, typename Dev>\
struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
const Param m_param;\
EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
: xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
};
//TensorReshapingOp
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
//TensorShufflingOp
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
// TensorPaddingOp has to be specialised separately; otherwise it would be matched by the UnaryCategory specialisation.
#define PADDINGOPFUNCEXT(OPEXPR, FUNCCALL, SCALARFUNCCALL, CVQual)\
template<typename Param, typename XprType, typename Dev>\
struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
const Param m_param;\
typedef typename Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>::Scalar Scalar;\
const Scalar m_scalar_param;\
EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
EIGEN_STRONG_INLINE const Scalar& scalar_param() const { return m_scalar_param; }\
FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
: xprExpr(expr.impl()), m_param(expr.FUNCCALL), m_scalar_param(expr.SCALARFUNCCALL) {}\
};
PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
#undef PADDINGOPFUNCEXT
/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp or TensorConcatenationOp.
/// For TensorContractionOp the LHS and RHS here are the original expressions, so no condition needs to be applied to their types.
#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\
template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\
FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
const Param func;\
FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
};
// TensorConcatenationOp
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
#undef SYCLEXTRFUNCCONTRACTCONCAT
//TensorChippingOp
#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\
template<DenseIndex DimId, typename XprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device> >{\
FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
const DenseIndex m_dim;\
const DenseIndex m_offset;\
EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\
EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\
FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\
: xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\
};
SYCLEXTRFUNCCHIPPINGOP(const)
SYCLEXTRFUNCCHIPPINGOP()
#undef SYCLEXTRFUNCCHIPPINGOP
//TensorImagePatchOp
#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\
typedef CVQual TensorImagePatchOp<Rows, Cols, XprType> Self;\
FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
const DenseIndex m_patch_rows;\
const DenseIndex m_patch_cols;\
const DenseIndex m_row_strides;\
const DenseIndex m_col_strides;\
const DenseIndex m_in_row_strides;\
const DenseIndex m_in_col_strides;\
const DenseIndex m_row_inflate_strides;\
const DenseIndex m_col_inflate_strides;\
const bool m_padding_explicit;\
const DenseIndex m_padding_top;\
const DenseIndex m_padding_bottom;\
const DenseIndex m_padding_left;\
const DenseIndex m_padding_right;\
const PaddingType m_padding_type;\
const typename Self::Scalar m_padding_value;\
FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\
: xprExpr(expr.impl()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\
m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\
m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\
m_row_inflate_strides(expr.xpr().row_inflate_strides()), m_col_inflate_strides(expr.xpr().col_inflate_strides()),\
m_padding_explicit(expr.xpr().padding_explicit()),m_padding_top(expr.xpr().padding_top()),\
m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\
m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),\
m_padding_value(expr.xpr().padding_value()){}\
};
SYCLEXTRFUNCIMAGEPATCHOP(const)
SYCLEXTRFUNCIMAGEPATCHOP()
#undef SYCLEXTRFUNCIMAGEPATCHOP
/// TensorVolumePatchOp
#define SYCLEXTRFUNCVOLUMEPATCHOP(CVQual)\
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Device> >{\
typedef CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> Self;\
FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
const DenseIndex m_patch_planes;\
const DenseIndex m_patch_rows;\
const DenseIndex m_patch_cols;\
const DenseIndex m_plane_strides;\
const DenseIndex m_row_strides;\
const DenseIndex m_col_strides;\
const DenseIndex m_in_plane_strides;\
const DenseIndex m_in_row_strides;\
const DenseIndex m_in_col_strides;\
const DenseIndex m_plane_inflate_strides;\
const DenseIndex m_row_inflate_strides;\
const DenseIndex m_col_inflate_strides;\
const bool m_padding_explicit;\
const DenseIndex m_padding_top_z;\
const DenseIndex m_padding_bottom_z;\
const DenseIndex m_padding_top;\
const DenseIndex m_padding_bottom;\
const DenseIndex m_padding_left;\
const DenseIndex m_padding_right;\
const PaddingType m_padding_type;\
const typename Self::Scalar m_padding_value;\
FunctorExtractor(const TensorEvaluator<Self, Device>& expr)\
: xprExpr(expr.impl()), m_patch_planes(expr.xpr().patch_planes()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\
m_plane_strides(expr.xpr().plane_strides()), m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\
m_in_plane_strides(expr.xpr().in_plane_strides()), m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\
m_plane_inflate_strides(expr.xpr().plane_inflate_strides()),m_row_inflate_strides(expr.xpr().row_inflate_strides()),\
m_col_inflate_strides(expr.xpr().col_inflate_strides()), m_padding_explicit(expr.xpr().padding_explicit()),\
m_padding_top_z(expr.xpr().padding_top_z()), m_padding_bottom_z(expr.xpr().padding_bottom_z()), \
m_padding_top(expr.xpr().padding_top()), m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\
m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),m_padding_value(expr.xpr().padding_value()){}\
};
SYCLEXTRFUNCVOLUMEPATCHOP(const)
SYCLEXTRFUNCVOLUMEPATCHOP()
#undef SYCLEXTRFUNCVOLUMEPATCHOP
/// template deduction function for FunctorExtractor
template <typename Evaluator>
inline auto extractFunctors(const Evaluator& evaluator) -> FunctorExtractor<Evaluator> {
return FunctorExtractor<Evaluator>(evaluator);
}
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
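A minimal sketch of the idea the FunctorExtractor file above implements, under the assumption of a hypothetical stateful ScaleOp functor (none of these names exist in Eigen): because the functor carries state, a copy of it is packed into a plain struct on the host so that the same instance can be used when the expression is rebuilt for the device.
// Simplified analogue of FunctorExtractor/extractFunctors (hypothetical types).
#include <iostream>
struct ScaleOp {                       // a stateful host-side functor
  float factor;
  float operator()(float x) const { return factor * x; }
};
template <typename Op> struct Extracted {
  Op func;                             // the packed copy that would be shipped to the device
  explicit Extracted(const Op& op) : func(op) {}
};
template <typename Op> Extracted<Op> extract(const Op& op) { return Extracted<Op>(op); }
int main() {
  ScaleOp host_op = {2.5f};
  Extracted<ScaleOp> packed = extract(host_op);  // analogous to extractFunctors(evaluator)
  std::cout << packed.func(4.0f) << "\n";        // prints 10
}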

View File

@ -1,248 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: eigen@codeplay.com
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// General include header of SYCL target for Tensor Module
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
namespace Eigen {
namespace TensorSycl {
namespace internal {
template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
OP op;
OutputAccessor aOut;
ptrdiff_t out_offset;
InputAccessor aI;
LocalAccessor scratch;
size_t length, local;
GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
: op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
void operator()(cl::sycl::nd_item<1> itemID) {
size_t globalid = itemID.get_global(0);
size_t localid = itemID.get_local(0);
/* All threads collectively read from global memory into local.
* The barrier ensures all threads' IO is resolved before
* execution continues (strictly speaking, all threads within
* a single work-group - there is no co-ordination between
* work-groups, only work-items). */
if (globalid < length) {
scratch[localid] = aI[globalid];
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
/* Apply the reduction operation between the current local
* id and the one on the other half of the vector. */
if (globalid < length) {
auto min = (length < local) ? length : local;
for (size_t offset = min / 2; offset > 0; offset /= 2) {
if (localid < offset) {
auto accum = op.initialize();
op.reduce(scratch[localid], &accum);
op.reduce(scratch[localid + offset], &accum);
op.finalize(accum);
scratch[localid]=accum;
//scratch[localid] += scratch[localid + offset];
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
/* The final result will be stored in local id 0. */
if (localid == 0) {
aI[itemID.get_group(0)] = scratch[localid];
if((length<=local) && globalid ==0){
auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
}
}
}
}
};
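// To make the halving-tree loop above easier to follow, the following is a plain, single-threaded sketch of the
// same reduction pattern (assuming, for simplicity, that the input length is a power of two and that a plain sum
// stands in for the generic op). The inner loop plays the role of the parallel work-items of one work-group, and
// finishing one pass of it corresponds to the barrier in the kernel.
#include <iostream>
#include <vector>
float tree_reduce(std::vector<float> scratch) {   // assumes scratch.size() is a power of two
  for (size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (size_t i = 0; i < offset; ++i) {         // one iteration per work-item
      scratch[i] += scratch[i + offset];          // op.reduce(...) in the kernel above
    }                                             // end of one step == barrier in the kernel
  }
  return scratch[0];                              // the partial result ends up in slot 0
}
int main() {
  std::cout << tree_reduce({1, 2, 3, 4, 5, 6, 7, 8}) << "\n";  // prints 36
}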
/// ReductionFunctor
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// A reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// the first is when it is used as the root node that launches the sub-kernel; the second is when it is treated as a leaf node that passes its
/// calculated result to its parent kernel. The latter is detected automatically by our device expression generator, while the former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. It is identical to the self expression that was passed to the run function, except that
/// the device evaluator is detectable and recognisable on the device.
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=static_cast<Index>(itemID.get_global_linear_id());
if (globalid< range) {
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
}
}
private:
write_accessor output_accessor;
ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
Op functor;
Index range;
};
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// A reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes the
/// calculated result to its parent kernel. The latter is detected automatically through our device expression generator, while the former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. It is equivalent to the self evaluator that was passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=static_cast<Index>(itemID.get_global_linear_id());
if (globalid< range) {
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
functor.finalize(accum);
output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
}
}
private:
write_accessor output_accessor;
ptrdiff_t out_offset;
FunctorExpr functors;
Tuple_of_Acc tuple_of_accessors;
Dims dims;
Op functor;
Index range;
Index num_values_to_reduce;
};
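// Illustration of the MeanReducer specialisation above (hypothetical numbers): the
// functor runs a plain SumReducer over the reduced dimensions and divides the
// finalized sum by num_values_to_reduce, so reducing the four values
// {1.0f, 2.0f, 3.0f, 4.0f} writes (1+2+3+4)/4 = 2.5f into the output slot of that
// coefficient.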
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
class FullReductionKernelFunctor{
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
OutAccessor tmp_global_accessor;
Index rng , remaining, red_factor;
Op op;
Dims dims;
FunctorExpr functors;
TupleType tuple_of_accessors;
FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
:tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// A reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes the
/// calculated result to its parent kernel. The latter is detected automatically through our device expression generator, while the former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. It is equivalent to the self evaluator that was passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id();
tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
: static_cast<CoeffReturnType>(op.initialize());
if(remaining!=0 && globalid==0 ){
// This adds the rest of the input buffer when the input size is not divisible by red_factor.
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
auto accum = op.initialize();
op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
op.reduce(remaining_reduce, &accum);
op.finalize(accum);
tmp_global_accessor.get_pointer()[0]=accum;
}
}
};
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
OutAccessor tmp_global_accessor;
Index rng , remaining, red_factor;
Op op;
Dims dims;
FunctorExpr functors;
TupleType tuple_of_accessors;
FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
:tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// A reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
/// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes the
/// calculated result to its parent kernel. The latter is detected automatically through our device expression generator, while the former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. It is equivalent to the self evaluator that was passed to the run function, except that
/// the device_evaluator is detectable and recognisable on the device.
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id();
auto scale = (rng*red_factor) + remaining;
tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
:static_cast<CoeffReturnType>(op.initialize())/scale;
if(remaining!=0 && globalid==0 ){
// This adds the rest of the input buffer when the input size is not divisible by red_factor.
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
auto accum = op.initialize();
tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
op.reduce(remaining_reduce, &accum);
op.finalize(accum);
tmp_global_accessor.get_pointer()[0]=accum/scale;
}
}
};
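// Note on the splitting arithmetic used above: rng, red_factor and remaining are
// supplied by the host-side caller (not shown in this file). Assuming the caller
// splits a length-N input as rng = N / red_factor and remaining = N % red_factor,
// the scale computed above satisfies scale = rng * red_factor + remaining == N,
// i.e. it is the total number of reduced values and therefore the divisor that
// turns the running sums into a mean. For example, N = 1000 with red_factor = 256
// would give rng = 3, remaining = 232.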
}
}
}
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H


@ -1,213 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorSyclLeafCount.h
*
* \brief:
* The leaf count uses a pre-order traversal of the expression tree in order to
* count the number of leaf nodes in the expression
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
/// \brief LeafCount is used to count terminal nodes. The total number of
/// leaf nodes is used by MakePlaceHolderExprHelper to find the order
/// of each leaf node in an expression tree at compile time.
template <typename Expr>
struct LeafCount;
template<typename... Args> struct CategoryCount;
template<> struct CategoryCount<>
{
static const size_t Count =0;
};
template<typename Arg, typename... Args>
struct CategoryCount<Arg,Args...>{
static const size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
};
/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
#define SYCLTENSORMAPLEAFCOUNT(CVQual)\
template <typename PlainObjectType, int Options_, template <class> class MakePointer_>\
struct LeafCount<CVQual TensorMap<PlainObjectType, Options_, MakePointer_> > {\
static const size_t Count =1;\
};
SYCLTENSORMAPLEAFCOUNT(const)
SYCLTENSORMAPLEAFCOUNT()
#undef SYCLTENSORMAPLEAFCOUNT
// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
#define SYCLCATEGORYLEAFCOUNT(CVQual)\
template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>\
struct LeafCount<CVQual CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
SYCLCATEGORYLEAFCOUNT(const)
SYCLCATEGORYLEAFCOUNT()
#undef SYCLCATEGORYLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp; this is an exception to the generic category rule, counting the if/then/else sub-expressions directly
#define SYCLSELECTOPLEAFCOUNT(CVQual)\
template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
struct LeafCount<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
SYCLSELECTOPLEAFCOUNT(const)
SYCLSELECTOPLEAFCOUNT()
#undef SYCLSELECTOPLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorAssignOp
#define SYCLLEAFCOUNTASSIGNOP(CVQual)\
template <typename LHSExpr, typename RHSExpr>\
struct LeafCount<CVQual TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
SYCLLEAFCOUNTASSIGNOP(const)
SYCLLEAFCOUNTASSIGNOP()
#undef SYCLLEAFCOUNTASSIGNOP
/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
#define SYCLFORCEDEVALLEAFCOUNT(CVQual)\
template <typename Expr>\
struct LeafCount<CVQual TensorForcedEvalOp<Expr> > {\
static const size_t Count =1;\
};
SYCLFORCEDEVALLEAFCOUNT(const)
SYCLFORCEDEVALLEAFCOUNT()
#undef SYCLFORCEDEVALLEAFCOUNT
#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\
template <typename CustomUnaryFunc, typename XprType>\
struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\
static const size_t Count =1;\
};
SYCLCUSTOMUNARYOPLEAFCOUNT(const)
SYCLCUSTOMUNARYOPLEAFCOUNT()
#undef SYCLCUSTOMUNARYOPLEAFCOUNT
#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\
template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\
struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\
static const size_t Count =1;\
};
SYCLCUSTOMBINARYOPLEAFCOUNT( const)
SYCLCUSTOMBINARYOPLEAFCOUNT()
#undef SYCLCUSTOMBINARYOPLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp, TensorLayoutSwapOp, or TensorIndexTupleOp
#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\
template <typename Expr>\
struct LeafCount<CVQual ExprNode<Expr> > {\
static const size_t Count = Num + CategoryCount<Expr>::Count;\
};
EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1)
EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1)
EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0)
EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0)
EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0)
EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0)
#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp or TensorTupleReducerOp (argmax)
#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\
template <typename OP, typename Dim, typename Expr>\
struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\
static const size_t Count =1;\
};
// TensorReductionOp
REDUCTIONLEAFCOUNT(const,TensorReductionOp)
REDUCTIONLEAFCOUNT(,TensorReductionOp)
// tensor Argmax -TensorTupleReducerOp
REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp)
REDUCTIONLEAFCOUNT(, TensorTupleReducerOp)
#undef REDUCTIONLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorContractionOp or TensorConvolutionOp
#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
template <typename Indices, typename LhsXprType, typename RhsXprType>\
struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
static const size_t Count =1;\
};
CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
#define SLICEOPLEAFCOUNT(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType>\
struct LeafCount<CVQual TensorSlicingOp<StartIndices, Sizes, XprType> >:CategoryCount<XprType>{};
SLICEOPLEAFCOUNT(const)
SLICEOPLEAFCOUNT()
#undef SLICEOPLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
#define CHIPPINGOPLEAFCOUNT(CVQual)\
template <DenseIndex DimId, typename XprType>\
struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{};
CHIPPINGOPLEAFCOUNT(const)
CHIPPINGOPLEAFCOUNT()
#undef CHIPPINGOPLEAFCOUNT
///TensorStridingSlicingOp
#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{};
SLICESTRIDEOPLEAFCOUNT(const)
SLICESTRIDEOPLEAFCOUNT()
#undef SLICESTRIDEOPLEAFCOUNT
//TensorImagePatchOp
#define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
struct LeafCount<CVQual TensorImagePatchOp<Rows, Cols, XprType> >:CategoryCount<XprType>{};
TENSORIMAGEPATCHOPLEAFCOUNT(const)
TENSORIMAGEPATCHOPLEAFCOUNT()
#undef TENSORIMAGEPATCHOPLEAFCOUNT
// TensorVolumePatchOp
#define TENSORVOLUMEPATCHOPLEAFCOUNT(CVQual)\
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{};
TENSORVOLUMEPATCHOPLEAFCOUNT(const)
TENSORVOLUMEPATCHOPLEAFCOUNT()
#undef TENSORVOLUMEPATCHOPLEAFCOUNT
} /// namespace internal
} /// namespace TensorSycl
} /// namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
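A rough worked example of these counting rules, using hypothetical expressions (the exact Eigen expression template types are elided):
// TensorMap                                   -> Count == 1
// map_a * map_b        (TensorCwiseBinaryOp)  -> Count == 1 + 1 == 2
// TensorAssignOp<map_out, map_a * map_b>      -> Count == 1 + 2 == 3
// (map_a * map_b).sum(dims)                   -> Count == 1
//   (TensorReductionOp, like TensorForcedEvalOp and TensorContractionOp,
//    is treated as a single opaque leaf.)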


@ -1,302 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorSyclPlaceHolderExpr.h
*
* \brief:
* This is the specialisation of the placeholder expression based on the
* operation type
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
/// \struct PlaceHolder
/// \brief PlaceHolder is used to replace the \ref TensorMap in the expression
/// tree.
/// PlaceHolder contains the order of the leaf node in the expression tree.
template <typename Scalar, size_t N>
struct PlaceHolder {
static constexpr size_t I = N;
typedef Scalar Type;
};
/// \struct PlaceHolderExpression
/// \brief it is used to create the PlaceHolder expression. The PlaceHolder
/// expression is a copy of the expression type in which each TensorMap
/// has been replaced with a PlaceHolder.
template <typename Expr, size_t N>
struct PlaceHolderExpression;
template<size_t N, typename... Args>
struct CalculateIndex;
template<size_t N, typename Arg>
struct CalculateIndex<N, Arg>{
typedef typename PlaceHolderExpression<Arg, N>::Type ArgType;
typedef utility::tuple::Tuple<ArgType> ArgsTuple;
};
template<size_t N, typename Arg1, typename Arg2>
struct CalculateIndex<N, Arg1, Arg2>{
static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
typedef typename PlaceHolderExpression<Arg1, N - Arg2LeafCount>::Type Arg1Type;
typedef typename PlaceHolderExpression<Arg2, N>::Type Arg2Type;
typedef utility::tuple::Tuple<Arg1Type, Arg2Type> ArgsTuple;
};
template<size_t N, typename Arg1, typename Arg2, typename Arg3>
struct CalculateIndex<N, Arg1, Arg2, Arg3> {
static const size_t Arg3LeafCount = LeafCount<Arg3>::Count;
static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
typedef typename PlaceHolderExpression<Arg1, N - Arg3LeafCount - Arg2LeafCount>::Type Arg1Type;
typedef typename PlaceHolderExpression<Arg2, N - Arg3LeafCount>::Type Arg2Type;
typedef typename PlaceHolderExpression<Arg3, N>::Type Arg3Type;
typedef utility::tuple::Tuple<Arg1Type, Arg2Type, Arg3Type> ArgsTuple;
};
template<template<class...> class Category , class OP, class TPL>
struct CategoryHelper;
template<template<class...> class Category , class OP, class ...T >
struct CategoryHelper<Category, OP, utility::tuple::Tuple<T...> > {
typedef Category<OP, T... > Type;
};
template<template<class...> class Category , class ...T >
struct CategoryHelper<Category, NoOP, utility::tuple::Tuple<T...> > {
typedef Category<T... > Type;
};
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorBroadcastingOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp
#define OPEXPRCATEGORY(CVQual)\
template <template <class, class... > class Category, typename OP, typename... SubExpr, size_t N>\
struct PlaceHolderExpression<CVQual Category<OP, SubExpr...>, N>{\
typedef CVQual typename CategoryHelper<Category, OP, typename CalculateIndex<N, SubExpr...>::ArgsTuple>::Type Type;\
};
OPEXPRCATEGORY(const)
OPEXPRCATEGORY()
#undef OPEXPRCATEGORY
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorSelectOp
#define SELECTEXPR(CVQual)\
template <typename IfExpr, typename ThenExpr, typename ElseExpr, size_t N>\
struct PlaceHolderExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, N> {\
typedef CVQual typename CategoryHelper<TensorSelectOp, NoOP, typename CalculateIndex<N, IfExpr, ThenExpr, ElseExpr>::ArgsTuple>::Type Type;\
};
SELECTEXPR(const)
SELECTEXPR()
#undef SELECTEXPR
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorAssignOp
#define ASSIGNEXPR(CVQual)\
template <typename LHSExpr, typename RHSExpr, size_t N>\
struct PlaceHolderExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr>, N> {\
typedef CVQual typename CategoryHelper<TensorAssignOp, NoOP, typename CalculateIndex<N, LHSExpr, RHSExpr>::ArgsTuple>::Type Type;\
};
ASSIGNEXPR(const)
ASSIGNEXPR()
#undef ASSIGNEXPR
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorMap
#define TENSORMAPEXPR(CVQual)\
template <typename T, int Options_, template <class> class MakePointer_, size_t N>\
struct PlaceHolderExpression< CVQual TensorMap< T, Options_, MakePointer_>, N> {\
typedef CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N> Type;\
};
TENSORMAPEXPR(const)
TENSORMAPEXPR()
#undef TENSORMAPEXPR
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorForcedEvalOp
#define FORCEDEVAL(CVQual)\
template <typename Expr, size_t N>\
struct PlaceHolderExpression<CVQual TensorForcedEvalOp<Expr>, N> {\
typedef CVQual PlaceHolder<CVQual TensorForcedEvalOp<Expr>, N> Type;\
};
FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorCustomUnaryOp
#define CUSTOMUNARYOPEVAL(CVQual)\
template <typename CustomUnaryFunc, typename XprType, size_t N>\
struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\
typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\
};
CUSTOMUNARYOPEVAL(const)
CUSTOMUNARYOPEVAL()
#undef CUSTOMUNARYOPEVAL
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorCustomBinaryOp
#define CUSTOMBINARYOPEVAL(CVQual)\
template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\
struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\
typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\
};
CUSTOMBINARYOPEVAL(const)
CUSTOMBINARYOPEVAL()
#undef CUSTOMBINARYOPEVAL
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp
#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\
template <typename Expr, size_t N>\
struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\
typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\
};
// TensorEvalToOp
EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp)
EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp)
//TensorLayoutSwapOp
EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp)
EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp)
//TensorIndexTupleOp
EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp)
EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp)
#undef EVALTOLAYOUTSWAPINDEXTUPLE
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorChippingOp
#define CHIPPINGOP(CVQual)\
template <DenseIndex DimId, typename Expr, size_t N>\
struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\
typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\
};
CHIPPINGOP(const)
CHIPPINGOP()
#undef CHIPPINGOP
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorReductionOp and TensorTupleReducerOp (Argmax)
#define SYCLREDUCTION(CVQual, ExprNode)\
template <typename OP, typename Dims, typename Expr, size_t N>\
struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\
typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\
};
// tensor reduction
SYCLREDUCTION(const, TensorReductionOp)
SYCLREDUCTION(, TensorReductionOp)
// tensor Argmax -TensorTupleReducerOp
SYCLREDUCTION(const, TensorTupleReducerOp)
SYCLREDUCTION(, TensorTupleReducerOp)
#undef SYCLREDUCTION
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorContractionOp and TensorConvolutionOp
#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
};
SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
#undef SYCLCONTRACTIONCONVOLUTIONPLH
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorSlicingOp
#define SLICEOPEXPR(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType, size_t N>\
struct PlaceHolderExpression<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, N> {\
typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename CalculateIndex<N, XprType>::ArgType> Type;\
};
SLICEOPEXPR(const)
SLICEOPEXPR()
#undef SLICEOPEXPR
#define SYCLSLICESTRIDEOPPLH(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, size_t N>\
struct PlaceHolderExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, N> {\
typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename CalculateIndex<N, XprType>::ArgType> Type;\
};
SYCLSLICESTRIDEOPPLH(const)
SYCLSLICESTRIDEOPPLH()
#undef SYCLSLICESTRIDEOPPLH
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorImagePatchOp
#define SYCLTENSORIMAGEPATCHOP(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\
struct PlaceHolderExpression<CVQual TensorImagePatchOp<Rows, Cols, XprType>, N> {\
typedef CVQual TensorImagePatchOp<Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\
};
SYCLTENSORIMAGEPATCHOP(const)
SYCLTENSORIMAGEPATCHOP()
#undef SYCLTENSORIMAGEPATCHOP
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorVolumePatchOp
#define SYCLTENSORVOLUMEPATCHOP(CVQual)\
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType, size_t N>\
struct PlaceHolderExpression<CVQual TensorVolumePatchOp<Planes,Rows, Cols, XprType>, N> {\
typedef CVQual TensorVolumePatchOp<Planes,Rows, Cols, typename CalculateIndex <N, XprType>::ArgType> Type;\
};
SYCLTENSORVOLUMEPATCHOP(const)
SYCLTENSORVOLUMEPATCHOP()
#undef SYCLTENSORVOLUMEPATCHOP
/// template deduction for \ref PlaceHolderExpression struct
template <typename Expr>
struct createPlaceHolderExpression {
static const size_t TotalLeaves = LeafCount<Expr>::Count;
typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type;
};
} // internal
} // TensorSycl
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
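A worked example of the numbering performed by createPlaceHolderExpression, again with hypothetical expressions (exact Eigen expression template types elided):
// Expr = TensorAssignOp< map_out , map_a * map_b >      // three leaves
// Numbering starts at N = TotalLeaves - 1 = 2 and is handed out right to left
// through CalculateIndex:
//   map_b   -> PlaceHolder<..., 2>   (right-most leaf keeps N)
//   map_a   -> PlaceHolder<..., 1>   (N - LeafCount<map_b>::Count)
//   map_out -> PlaceHolder<..., 0>   (N - LeafCount<map_a * map_b>::Count)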


@ -1,96 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Cummins Chris PhD student at The University of Edinburgh.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorSyclRun.h
*
* \brief:
* Schedule_kernel invokes a specialised version of the kernel struct. The
* specialisation is based on the data dimension in the SYCL buffer
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
namespace Eigen {
namespace TensorSycl {
template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
typedef typename Expr::Index Index;
FunctorExpr functors;
TupleType tuple_of_accessors;
Index range;
ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_)
: functors(functors_), tuple_of_accessors(tuple_of_accessors_), range(range_){}
void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
if (gId < range)
device_evaluator.evalScalar(gId);
}
};
/// The run function in TensorSycl converts the expression tree to a buffer-based
/// expression tree,
/// creates the expression tree for the device with accessors to the buffers,
/// then constructs the kernel and submits it to the SYCL queue.
/// std::array does not have TotalSize, so the size is obtained through template specialisation.
template<typename , typename Dimensions> struct DimensionSize{
static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
return dim.TotalSize();
}
};
#define DIMSIZEMACRO(CVQual)\
template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
}\
};
DIMSIZEMACRO(const)
DIMSIZEMACRO()
#undef DIMSIZEMACRO
template <typename Expr, typename Dev>
void run(Expr &expr, Dev &dev) {
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
FunctorExpr functors = internal::extractFunctors(evaluator);
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
typename Expr::Index range, GRange, tileSize;
typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
dev.parallel_for_setup(total_size, tileSize, range, GRange);
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range
, functors, tuple_of_accessors
));
});
dev.asynchronousExec();
}
evaluator.cleanup();
}
} // namespace TensorSycl
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
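A sketch of the launch arithmetic in run() above; parallel_for_setup is defined elsewhere, so the descriptions below are assumptions consistent with how its outputs are used here:
// total_size : number of coefficients reported by the evaluator's dimensions
// tileSize   : work-group size chosen for the device
// GRange     : global range, presumably total_size rounded up to a multiple of tileSize
// range      : the element count passed to ExecExprFunctorKernel, so trailing work
//              items with gId >= range simply return without calling evalScalar.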


@ -1,239 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorSyclTuple.h
*
* \brief:
* Minimal implementation of std::tuple that can be used inside a SYCL kernel.
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
namespace utility {
namespace tuple {
/// \struct StaticIf
/// \brief The StaticIf struct is used to statically choose the type based on the
/// condition.
template <bool, typename T = void> struct StaticIf;
/// \brief specialisation of the \ref StaticIf when the condition is true
template <typename T>
struct StaticIf<true, T> {
typedef T type;
};
/// \struct Tuple
/// \brief is a fixed-size collection of heterogeneous values
/// \tparam Ts... - the types of the elements that the tuple stores.
/// Empty list is supported.
template <class... Ts>
struct Tuple {};
/// \brief specialisation of the \ref Tuple class when the tuple has at least
/// one element.
/// \tparam T : the type of the first element in the tuple.
/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
template <class T, class... Ts>
struct Tuple<T, Ts...> {
Tuple(T t, Ts... ts) : head(t), tail(ts...) {}
T head;
Tuple<Ts...> tail;
};
/// \struct ElemTypeHolder
/// \brief ElemTypeHolder class is used to specify the types of the
/// elements inside the tuple
/// \tparam size_t the index of the requested element inside the tuple
/// \tparam class the tuple class
template <size_t, class>
struct ElemTypeHolder;
/// \brief specialisation of the \ref ElemTypeHolder class for the first
/// element (index 0) of the tuple
template <class T, class... Ts>
struct ElemTypeHolder<0, Tuple<T, Ts...> > {
typedef T type;
};
/// \brief specialisation of the \ref ElemTypeHolder class for a non-zero
/// index. It recursively strips the head of the tuple until it reaches
/// the type of the requested element
/// \tparam T : the type of the first element in the tuple.
/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
/// \tparam K is the Kth element in the tuple
template <size_t k, class T, class... Ts>
struct ElemTypeHolder<k, Tuple<T, Ts...> > {
typedef typename ElemTypeHolder<k - 1, Tuple<Ts...> >::type type;
};
/// get
/// \brief Extracts the first element from the tuple.
/// K=0 represents the first element of the tuple. The tuple cannot be empty.
/// \tparam Ts... are the type of the elements in the tuple.
/// \param t is the tuple whose contents to extract
/// \return a (possibly const) reference to the first element of the tuple
#define TERMINATE_CONDS_TUPLE_GET(CVQual) \
template <size_t k, class... Ts> \
typename StaticIf<k == 0, CVQual typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type \
get(CVQual Tuple<Ts...> &t) { \
static_assert(sizeof...(Ts)!=0, "The requested index is bigger than the size of the tuple"); \
return t.head; \
}
TERMINATE_CONDS_TUPLE_GET(const)
TERMINATE_CONDS_TUPLE_GET()
#undef TERMINATE_CONDS_TUPLE_GET
/// get
/// \brief Extracts the Kth element from the tuple.
///\tparam K is an integer value in [0,sizeof...(Types)).
/// \tparam T is the (sizeof...(Types) -(K+1)) element in the tuple
/// \tparam Ts... are the type of the elements in the tuple.
/// \param t is the tuple whose contents to extract
/// \return a (possibly const) reference to the Kth element of the tuple
#define RECURSIVE_TUPLE_GET(CVQual) \
template <size_t k, class T, class... Ts> \
typename StaticIf<k != 0, CVQual typename ElemTypeHolder<k, Tuple<T, Ts...> >::type &>::type \
get(CVQual Tuple<T, Ts...> &t) { \
return utility::tuple::get<k - 1>(t.tail); \
}
RECURSIVE_TUPLE_GET(const)
RECURSIVE_TUPLE_GET()
#undef RECURSIVE_TUPLE_GET
/// make_tuple
/// \brief Creates a tuple object, deducing the target type from the types of
/// arguments.
/// \tparam Args the type of the arguments to construct the tuple from
/// \param args zero or more arguments to construct the tuple from
/// \return Tuple<Args...>
template <typename... Args>
Tuple<Args...> make_tuple(Args... args) {
return Tuple<Args...>(args...);
}
/// size
/// \brief Provides access to the number of elements in a tuple as a
/// compile-time constant expression.
/// \tparam Args the type of the arguments to construct the tuple from
/// \return size_t
template <typename... Args>
static constexpr size_t size(Tuple<Args...> &) {
return sizeof...(Args);
}
/// \struct IndexList
/// \brief Creates a list of indices from the elements in the tuple
/// \tparam Is... a list of indices in the range [0, sizeof...(tuple elements))
template <size_t... Is>
struct IndexList {};
/// \struct RangeBuilder
/// \brief Collects internal details for generating index ranges [MIN, MAX)
/// Declare primary template for index range builder
/// \tparam MIN is the starting index in the tuple
/// \tparam N represents sizeof...(elements) - sizeof...(Is)
/// \tparam Is... is the list of indices generated so far
template <size_t MIN, size_t N, size_t... Is>
struct RangeBuilder;
// FIXME Doxygen has problems with recursive inheritance
#ifndef EIGEN_PARSED_BY_DOXYGEN
/// \brief base Step: Specialisation of the \ref RangeBuilder when the
/// MIN==MAX. In this case the Is... is [0 to sizeof...(tuple elements))
/// \tparam MIN is the starting index of the tuple
/// \tparam Is is [0 to sizeof...(tuple elements))
template <size_t MIN, size_t... Is>
struct RangeBuilder<MIN, MIN, Is...> {
typedef IndexList<Is...> type;
};
/// Induction step: Specialisation of the RangeBuilder class when N!=MIN
/// in this case we are recursively subtracting N by one and adding one
/// index to Is... list until MIN==N
/// \tparam MIN is the starting index in the tuple
/// \tparam N represents sizeof...(elements) - sizeof...(Is)
/// \tparam Is... is the list of indices generated so far
template <size_t MIN, size_t N, size_t... Is>
struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};
#endif // EIGEN_PARSED_BY_DOXYGEN
/// \brief IndexRange that returns a [MIN, MAX) index range
/// \tparam MIN is the starting index in the tuple
/// \tparam MAX is the size of the tuple
template <size_t MIN, size_t MAX>
struct IndexRange: RangeBuilder<MIN, MAX>::type {};
/// append_base
/// \brief unpacking the elements of the input tuple t and creating a new tuple
/// by adding element a at the end of it.
///\tparam Args... the type of the elements inside the tuple t
/// \tparam T the type of the new element to be added at the end of the tuple
/// \tparam I... is the list of indices in [0, sizeof...(t))
/// \param t the tuple to which we want to append a.
/// \param a the new element to be added to the tuple
/// \return Tuple<Args..., T>
template <typename... Args, typename T, size_t... I>
Tuple<Args..., T> append_base(Tuple<Args...> t, T a,IndexList<I...>) {
return utility::tuple::make_tuple(get<I>(t)..., a);
}
/// append
/// \brief the deduction function for \ref append_base that automatically
/// generates the \ref IndexRange
///\tparam Args... the type of the elements inside the tuple t
/// \tparam T the type of the new element to be added at the end of the tuple
/// \param t the tuple to which we want to append a.
/// \param a the new element to be added to the tuple
/// \return Tuple<Args..., T>
template <typename... Args, typename T>
Tuple<Args..., T> append(Tuple<Args...> t, T a) {
return utility::tuple::append_base(t, a, IndexRange<0, sizeof...(Args)>());
}
/// append_base
/// \brief This is a specialisation of \ref append_base when we want to
/// concatenate
/// tuple t2 at the end of the tuple t1. Here we unpack both tuples, generate the
/// IndexRange for each of them and create an output tuple that contains the
/// elements of both t1 and t2.
///\tparam Args1... the type of the elements inside the tuple t1
///\tparam Args2... the type of the elements inside the tuple t2
/// \tparam I1... is the list of index from [0 to sizeof...(t1))
/// \tparam I2... is the list of index from [0 to sizeof...(t2))
/// \param t1 is the tuple on which we want to append t2.
/// \param t2 is the tuple that is going to be added on t1.
/// \return Tuple<Args1..., Args2...>
template <typename... Args1, typename... Args2, size_t... I1, size_t... I2>
Tuple<Args1..., Args2...> append_base(Tuple<Args1...> t1, Tuple<Args2...> t2, IndexList<I1...>, IndexList<I2...>) {
return utility::tuple::make_tuple(get<I1>(t1)...,get<I2>(t2)...);
}
/// append
/// \brief deduction function for \ref append_base when we are appending tuple
/// t2 to tuple t1. In this case the \ref IndexRange for both tuples is
/// generated automatically.
///\tparam Args1... the type of the elements inside the tuple t1
///\tparam Args2... the type of the elements inside the tuple t2
/// \param t1 is the tuple on which we want to append t2.
/// \param t2 is the tuple that is going to be added on t1.
/// \return Tuple<Args1..., Args2...>
template <typename... Args1, typename... Args2>
Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
return utility::tuple::append_base(t1, t2, IndexRange<0, sizeof...(Args1)>(), IndexRange<0, sizeof...(Args2)>());
}
} // tuple
} // utility
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
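A minimal usage sketch for the tuple utility above (illustrative only; it assumes the utility::tuple definitions from this header are in scope):
#include <iostream>
int main() {
  auto t = utility::tuple::make_tuple(3, 42.5f, 'c');   // Tuple<int, float, char>
  int  first = utility::tuple::get<0>(t);               // 3
  char last  = utility::tuple::get<2>(t);               // 'c'
  auto t2    = utility::tuple::append(t, 7.0);          // Tuple<int, float, char, double>
  std::cout << first << " " << last << " "
            << utility::tuple::size(t2) << "\n";        // prints "3 c 4"
  return 0;
}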


@ -11,6 +11,8 @@ Click on the \e Modules tab at the top of this page to get a list of all unsuppo
Don't miss the <a href="../index.html">official Eigen documentation</a>. Don't miss the <a href="../index.html">official Eigen documentation</a>.
\subpage SYCL_EIGEN "SYCL backend for Eigen"
*/ */
/* /*
@ -26,3 +28,4 @@ subject to be included in %Eigen in the future.
/// \internal \brief Namespace containing low-level routines from the %Eigen library. /// \internal \brief Namespace containing low-level routines from the %Eigen library.
namespace internal {} namespace internal {}
} }

unsupported/doc/SYCL.dox Normal file

@ -0,0 +1,9 @@
/** \page SYCL_EIGEN Eigen SYCL Backend
Useful information for Eigen SYCL Backend:
- <a href="https://developer.codeplay.com/computecppce/latest/getting-started-with-eigen">Getting Started with Eigen</a>
- <a href="https://developer.codeplay.com/computecppce/latest/options-for-building-eigen-sycl">Options for Building Eigen SYCL</a>
*/


@ -18,3 +18,7 @@ foreach(example_src ${examples_SRCS})
) )
add_dependencies(unsupported_examples example_${example}) add_dependencies(unsupported_examples example_${example})
endforeach(example_src) endforeach(example_src)
if(EIGEN_TEST_SYCL)
add_subdirectory(SYCL)
endif(EIGEN_TEST_SYCL)


@ -0,0 +1,38 @@
FILE(GLOB examples_SRCS "*.cpp")
set(EIGEN_SYCL ON)
list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
if(EIGEN_SYCL_TRISYCL)
set(CMAKE_CXX_STANDARD 14)
set(STD_CXX_FLAG "-std=c++1z")
else(EIGEN_SYCL_TRISYCL)
if(MSVC)
# Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
# can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
set(CMAKE_CXX_STANDARD 14)
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
else()
set(CMAKE_CXX_STANDARD 11)
list(APPEND COMPUTECPP_USER_FLAGS -Wall)
endif()
# The following flags are not supported by Clang and can cause warnings
# if used with -Werror so they are removed here.
if(COMPUTECPP_USE_COMPILER_DRIVER)
set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
list(APPEND COMPUTECPP_USER_FLAGS
-DEIGEN_NO_ASSERTION_CHECKING=1
-no-serial-memop
-Xclang
-cl-mad-enable)
endif(EIGEN_SYCL_TRISYCL)
FOREACH(example_src ${examples_SRCS})
GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
ei_add_test_internal(${example} example_${example})
ADD_DEPENDENCIES(unsupported_examples example_${example})
ENDFOREACH(example_src)
set(EIGEN_SYCL OFF)


@ -0,0 +1,63 @@
#include <iostream>
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
int main()
{
using DataType = float;
using IndexType = int64_t;
constexpr auto DataLayout = Eigen::RowMajor;
auto devices = Eigen::get_sycl_supported_devices();
const auto device_selector = *devices.begin();
Eigen::QueueInterface queueInterface(device_selector);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
// create the tensors to be used in the operation
IndexType sizeDim1 = 3;
IndexType sizeDim2 = 3;
IndexType sizeDim3 = 3;
array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
// initialize the tensors with the data we want to manipulate
Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
// set up some random data in the tensors to be multiplied
in1 = in1.random();
in2 = in2.random();
// allocate device memory for the tensors
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
// create TensorMaps that wrap the device pointers
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
// copy the memory to the device and do the c=a*b calculation
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.size())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
// print out the results
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
std::cout << "device_out" << "(" << i << ", " << j << ", " << k << ") : " << out(i,j,k)
<< " vs host_out" << "(" << i << ", " << j << ", " << k << ") : " << in1(i,j,k) * in2(i,j,k) << "\n";
}
}
}
printf("c=a*b Done\n");
}


@ -111,40 +111,113 @@ ei_add_test(special_functions)
if(EIGEN_TEST_CXX11) if(EIGEN_TEST_CXX11)
if(EIGEN_TEST_SYCL) if(EIGEN_TEST_SYCL)
set(EIGEN_SYCL ON)
# Forward CMake options as preprocessor definitions
if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
endif()
if(EIGEN_SYCL_NO_LOCAL_MEM)
add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
endif()
if(EIGEN_SYCL_LOCAL_MEM)
add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
endif()
if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
endif()
if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0})
endif()
if(EIGEN_SYCL_LOCAL_THREAD_DIM1)
add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1})
endif()
if(EIGEN_SYCL_REG_M)
add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M})
endif()
if(EIGEN_SYCL_REG_N)
add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
endif()
if(EIGEN_SYCL_USE_PROGRAM_CLASS)
add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
endif()
if(EIGEN_SYCL_ASYNC_EXECUTION)
add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
endif()
if(EIGEN_SYCL_DISABLE_SKINNY)
add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY})
endif()
if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER)
add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER})
endif()
if(EIGEN_SYCL_DISABLE_RANK1)
add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1})
endif()
if(EIGEN_SYCL_DISABLE_SCALAR)
add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR})
endif()
if(EIGEN_SYCL_DISABLE_GEMV)
add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV})
endif()
if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION)
add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
endif()
if(EIGEN_SYCL_TRISYCL) if(EIGEN_SYCL_TRISYCL)
set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD 14)
set(STD_CXX_FLAG "-std=c++1z") set(STD_CXX_FLAG "-std=c++1z")
else() else()
# It should be safe to always run these tests as there is some fallback code for if(MSVC)
# older compiler that don't support cxx11. # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
# This is already set if EIGEN_TEST_CXX11 is enabled: # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
# set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 14)
# set(STD_CXX_FLAG "-std=c++11") list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
else()
set(CMAKE_CXX_STANDARD 11)
list(APPEND COMPUTECPP_USER_FLAGS -Wall)
endif()
# The following flags are not supported by Clang and can cause warnings
# if used with -Werror so they are removed here.
if(COMPUTECPP_USE_COMPILER_DRIVER)
set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
list(APPEND COMPUTECPP_USER_FLAGS
-DEIGEN_NO_ASSERTION_CHECKING=1
-no-serial-memop
-Xclang
-cl-mad-enable)
endif() endif()
ei_add_test_sycl(cxx11_tensor_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG})
ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG})
ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG})
ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG})
ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG})
set(EIGEN_SYCL OFF)
endif() endif()
ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")


@@ -18,6 +1,7 @@
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::array;
@@ -26,9 +27,8 @@ using Eigen::Tensor;
using Eigen::TensorMap;
template <typename DataType, int Layout, typename DenseIndex>
static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
  Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2, 2, 2}});
  Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
  Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
  in.setRandom();
@@ -39,14 +39,15 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){
  std::size_t in_bytes = in.size() * sizeof(DataType);
  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
  DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
  DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
  DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
  Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in,
                                                                           Eigen::array<DenseIndex, 3>{{2, 2, 2}});
  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max);
  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min);
  sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes);
  gpu_out_max.device(sycl_device) = gpu_in.argmax();
  gpu_out_min.device(sycl_device) = gpu_in.argmin();
@@ -54,7 +55,7 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){
  sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes);
  sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes);
  VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1);
  VERIFY_IS_EQUAL(out_min(), 0);
  sycl_device.deallocate(d_in);
@@ -62,22 +63,22 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){
  sycl_device.deallocate(d_out_min);
}

template <typename DataType, int DataLayout, typename DenseIndex>
static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
  DenseIndex sizeDim0 = 9;
  DenseIndex sizeDim1 = 3;
  DenseIndex sizeDim2 = 5;
  DenseIndex sizeDim3 = 7;
  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
  std::vector<DenseIndex> dims;
  dims.push_back(sizeDim0);
  dims.push_back(sizeDim1);
  dims.push_back(sizeDim2);
  dims.push_back(sizeDim3);
  for (DenseIndex dim = 0; dim < 4; ++dim) {
    array<DenseIndex, 3> out_shape;
    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
@@ -86,9 +87,13 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device)
      for (DenseIndex j = 0; j < sizeDim1; ++j) {
        for (DenseIndex k = 0; k < sizeDim2; ++k) {
          for (DenseIndex l = 0; l < sizeDim3; ++l) {
            ix[0] = i;
            ix[1] = j;
            ix[2] = k;
            ix[3] = l;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
            // = 10.0
            tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
          }
        }
      }
@@ -97,23 +102,23 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device)
    std::size_t in_bytes = tensor.size() * sizeof(DataType);
    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
    DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
    DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
        d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
                    size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect max to be in the first index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
    }
    sycl_device.synchronize();
@@ -122,15 +127,18 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device)
      for (DenseIndex j = 0; j < sizeDim1; ++j) {
        for (DenseIndex k = 0; k < sizeDim2; ++k) {
          for (DenseIndex l = 0; l < sizeDim3; ++l) {
            ix[0] = i;
            ix[1] = j;
            ix[2] = k;
            ix[3] = l;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
          }
        }
      }
    }
    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
@@ -144,20 +152,21 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device)
}

template <typename DataType, int DataLayout, typename DenseIndex>
static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
  DenseIndex sizeDim0 = 9;
  DenseIndex sizeDim1 = 3;
  DenseIndex sizeDim2 = 5;
  DenseIndex sizeDim3 = 7;
  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
  std::vector<DenseIndex> dims;
  dims.push_back(sizeDim0);
  dims.push_back(sizeDim1);
  dims.push_back(sizeDim2);
  dims.push_back(sizeDim3);
  for (DenseIndex dim = 0; dim < 4; ++dim) {
    array<DenseIndex, 3> out_shape;
    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
@@ -166,9 +175,12 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device)
      for (DenseIndex j = 0; j < sizeDim1; ++j) {
        for (DenseIndex k = 0; k < sizeDim2; ++k) {
          for (DenseIndex l = 0; l < sizeDim3; ++l) {
            ix[0] = i;
            ix[1] = j;
            ix[2] = k;
            ix[3] = l;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
            tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
          }
        }
      }
@@ -177,23 +189,23 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device)
    std::size_t in_bytes = tensor.size() * sizeof(DataType);
    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
    DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
    DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
        d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
                    size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect max to be in the first index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
    }
    sycl_device.synchronize();
@@ -202,15 +214,18 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device)
      for (DenseIndex j = 0; j < sizeDim1; ++j) {
        for (DenseIndex k = 0; k < sizeDim2; ++k) {
          for (DenseIndex l = 0; l < sizeDim3; ++l) {
            ix[0] = i;
            ix[1] = j;
            ix[2] = k;
            ix[3] = l;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
          }
        }
      }
    }
    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
@@ -223,10 +238,8 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device)
  }
}

template <typename DataType, typename Device_Selector>
void sycl_argmax_test_per_device(const Device_Selector& d) {
  QueueInterface queueInterface(d);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device);
@@ -238,8 +251,7 @@ template<typename DataType, typename Device_Selector> void sycl_argmax_test_per_
}

EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
  }
}
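For reference, the argmax/argmin calls exercised above behave the same way on the host; a minimal host-only sketch (illustrative only, not part of this patch):

// Host-only sketch of the behaviour verified by the SYCL argmax/argmin tests.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> t(2, 2, 2);
  // Fill with strictly increasing values so the extrema are unique.
  for (Eigen::DenseIndex i = 0; i < t.size(); ++i) t.data()[i] = static_cast<float>(i);

  // Reducing over all dimensions yields a rank-0 tensor holding a linear index.
  Eigen::Tensor<Eigen::DenseIndex, 0> amax = t.argmax();
  Eigen::Tensor<Eigen::DenseIndex, 0> amin = t.argmin();
  std::cout << amax() << " " << amin() << "\n";  // 7 and 0 for this 2x2x2 tensor

  // Reducing over a single dimension drops the rank by one, as in test_sycl_argmax_dim.
  Eigen::Tensor<Eigen::DenseIndex, 2> amax_dim1 = t.argmax(1);
  std::cout << amax_dim1(0, 0) << "\n";  // index along dimension 1 of the maximum
  return 0;
}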

View File

@@ -25,243 +25,330 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;

// Functions used to compare the TensorMap implementation on the device with
// the equivalent on the host
template <typename T> T square(T x) { return x * x; }
template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return T(1) / x; }

struct EqualAssignement {
  template <typename Lhs, typename Rhs>
  void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
};

struct PlusEqualAssignement {
  template <typename Lhs, typename Rhs>
  void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
};

template <typename DataType, int DataLayout,
          typename Assignement, typename Operator>
void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
                                    const array<int64_t, 3>& tensor_range) {
  Operator op;
  Assignement asgn;
  {
    /* Assignement(out, Operator(in)) */
    Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
    in = in.random() + DataType(0.01);
    out = out.random() + DataType(0.01);
    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
    DataType *gpu_data = static_cast<DataType *>(
        sycl_device.allocate(in.size() * sizeof(DataType)));
    DataType *gpu_data_out = static_cast<DataType *>(
        sycl_device.allocate(out.size() * sizeof(DataType)));
    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
    sycl_device.memcpyHostToDevice(gpu_data, in.data(),
                                   (in.size()) * sizeof(DataType));
    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
                                   (out.size()) * sizeof(DataType));
    auto device_expr = gpu_out.device(sycl_device);
    asgn(device_expr, op(gpu));
    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
                                   (out.size()) * sizeof(DataType));
    for (int64_t i = 0; i < out.size(); ++i) {
      DataType ver = reference(i);
      asgn(ver, op(in(i)));
      VERIFY_IS_APPROX(out(i), ver);
    }
    sycl_device.deallocate(gpu_data);
    sycl_device.deallocate(gpu_data_out);
  }
  {
    /* Assignement(out, Operator(out)) */
    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
    out = out.random() + DataType(0.01);
    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
    DataType *gpu_data_out = static_cast<DataType *>(
        sycl_device.allocate(out.size() * sizeof(DataType)));
    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
                                   (out.size()) * sizeof(DataType));
    auto device_expr = gpu_out.device(sycl_device);
    asgn(device_expr, op(gpu_out));
    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
                                   (out.size()) * sizeof(DataType));
    for (int64_t i = 0; i < out.size(); ++i) {
      DataType ver = reference(i);
      asgn(ver, op(reference(i)));
      VERIFY_IS_APPROX(out(i), ver);
    }
    sycl_device.deallocate(gpu_data_out);
  }
}

#define DECLARE_UNARY_STRUCT(FUNC)                                 \
  struct op_##FUNC {                                               \
    template <typename T>                                          \
    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) {   \
      return cl::sycl::FUNC(x);                                    \
    }                                                              \
    template <typename T>                                          \
    auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
      return x.FUNC();                                             \
    }                                                              \
  };
DECLARE_UNARY_STRUCT(abs)
DECLARE_UNARY_STRUCT(sqrt)
DECLARE_UNARY_STRUCT(rsqrt)
DECLARE_UNARY_STRUCT(square)
DECLARE_UNARY_STRUCT(cube)
DECLARE_UNARY_STRUCT(inverse)
DECLARE_UNARY_STRUCT(tanh)
DECLARE_UNARY_STRUCT(exp)
DECLARE_UNARY_STRUCT(expm1)
DECLARE_UNARY_STRUCT(log)
DECLARE_UNARY_STRUCT(ceil)
DECLARE_UNARY_STRUCT(floor)
DECLARE_UNARY_STRUCT(round)
DECLARE_UNARY_STRUCT(log1p)
DECLARE_UNARY_STRUCT(sign)
DECLARE_UNARY_STRUCT(isnan)
DECLARE_UNARY_STRUCT(isfinite)
DECLARE_UNARY_STRUCT(isinf)
template <typename DataType, int DataLayout, typename Assignement>
void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
const array<int64_t, 3>& tensor_range) {
#define RUN_UNARY_TEST(FUNC) \
test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
op_##FUNC>(sycl_device, tensor_range)
RUN_UNARY_TEST(abs);
RUN_UNARY_TEST(sqrt);
RUN_UNARY_TEST(rsqrt);
RUN_UNARY_TEST(square);
RUN_UNARY_TEST(cube);
RUN_UNARY_TEST(inverse);
RUN_UNARY_TEST(tanh);
RUN_UNARY_TEST(exp);
RUN_UNARY_TEST(expm1);
RUN_UNARY_TEST(log);
RUN_UNARY_TEST(ceil);
RUN_UNARY_TEST(floor);
RUN_UNARY_TEST(round);
RUN_UNARY_TEST(log1p);
RUN_UNARY_TEST(sign);
}
template <typename DataType, int DataLayout, typename Operator>
void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
const array<int64_t, 3>& tensor_range) {
/* out = op(in) */
Operator op;
Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
in = in.random() + DataType(0.01);
DataType *gpu_data = static_cast<DataType *>(
sycl_device.allocate(in.size() * sizeof(DataType)));
bool *gpu_data_out =
static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
sycl_device.memcpyHostToDevice(gpu_data, in.data(),
(in.size()) * sizeof(DataType));
gpu_out.device(sycl_device) = op(gpu);
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
(out.size()) * sizeof(bool));
for (int64_t i = 0; i < out.size(); ++i) {
VERIFY_IS_EQUAL(out(i), op(in(i)));
}
sycl_device.deallocate(gpu_data);
sycl_device.deallocate(gpu_data_out);
}
template <typename DataType, int DataLayout>
void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
const array<int64_t, 3>& tensor_range) {
test_unary_builtins_for_assignement<DataType, DataLayout,
PlusEqualAssignement>(sycl_device, tensor_range);
test_unary_builtins_for_assignement<DataType, DataLayout,
EqualAssignement>(sycl_device, tensor_range);
test_unary_builtins_return_bool<DataType, DataLayout,
op_isnan>(sycl_device, tensor_range);
test_unary_builtins_return_bool<DataType, DataLayout,
op_isfinite>(sycl_device, tensor_range);
test_unary_builtins_return_bool<DataType, DataLayout,
op_isinf>(sycl_device, tensor_range);
}
template <typename DataType>
static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
  int64_t sizeDim1 = 10;
  int64_t sizeDim2 = 10;
  int64_t sizeDim3 = 10;
  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
  test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
  test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
}

template <typename DataType, int DataLayout, typename Operator>
void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
                               const array<int64_t, 3>& tensor_range) {
/* out = op(in_1, in_2) */
Operator op;
Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
in_1 = in_1.random() + DataType(0.01);
in_2 = in_2.random() + DataType(0.01);
Tensor<DataType, 3, DataLayout, int64_t> reference(out);
DataType *gpu_data_1 = static_cast<DataType *>(
sycl_device.allocate(in_1.size() * sizeof(DataType)));
DataType *gpu_data_2 = static_cast<DataType *>(
sycl_device.allocate(in_2.size() * sizeof(DataType)));
DataType *gpu_data_out = static_cast<DataType *>(
sycl_device.allocate(out.size() * sizeof(DataType)));
TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
(in_1.size()) * sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
(in_2.size()) * sizeof(DataType));
gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
(out.size()) * sizeof(DataType));
for (int64_t i = 0; i < out.size(); ++i) {
VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
}
sycl_device.deallocate(gpu_data_1);
sycl_device.deallocate(gpu_data_2);
sycl_device.deallocate(gpu_data_out);
}

template <typename DataType, int DataLayout, typename Operator>
void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
                                     const array<int64_t, 3>& tensor_range) {
  /* out = op(in_1, 2) */
  Operator op;
  const DataType arg2(2);
  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
  in_1 = in_1.random();
  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
  DataType *gpu_data_1 = static_cast<DataType *>(
      sycl_device.allocate(in_1.size() * sizeof(DataType)));
  DataType *gpu_data_out = static_cast<DataType *>(
      sycl_device.allocate(out.size() * sizeof(DataType)));
  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
                                 (in_1.size()) * sizeof(DataType));
  gpu_out.device(sycl_device) = op(gpu_1, arg2);
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
                                 (out.size()) * sizeof(DataType));
  for (int64_t i = 0; i < out.size(); ++i) {
    VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
  }
  sycl_device.deallocate(gpu_data_1);
  sycl_device.deallocate(gpu_data_out);
}
#define DECLARE_BINARY_STRUCT(FUNC)                                                          \
  struct op_##FUNC {                                                                         \
    template <typename T1, typename T2>                                                      \
    auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) {            \
      return cl::sycl::FUNC(x, y);                                                           \
    }                                                                                        \
    template <typename T1, typename T2>                                                      \
    auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
      return x.FUNC(y);                                                                      \
    }                                                                                        \
  };

DECLARE_BINARY_STRUCT(cwiseMax)
DECLARE_BINARY_STRUCT(cwiseMin)

#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR)                          \
  struct op_##NAME {                                                      \
    template <typename T1, typename T2>                                   \
    auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
      return x OPERATOR y;                                                \
    }                                                                     \
  };

DECLARE_BINARY_STRUCT_OP(plus, +)
DECLARE_BINARY_STRUCT_OP(minus, -)
DECLARE_BINARY_STRUCT_OP(times, *)
DECLARE_BINARY_STRUCT_OP(divide, /)
DECLARE_BINARY_STRUCT_OP(modulo, %)
template <typename DataType, int DataLayout>
void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
const array<int64_t, 3>& tensor_range) {
test_binary_builtins_func<DataType, DataLayout,
op_cwiseMax>(sycl_device, tensor_range);
test_binary_builtins_func<DataType, DataLayout,
op_cwiseMin>(sycl_device, tensor_range);
test_binary_builtins_func<DataType, DataLayout,
op_plus>(sycl_device, tensor_range);
test_binary_builtins_func<DataType, DataLayout,
op_minus>(sycl_device, tensor_range);
test_binary_builtins_func<DataType, DataLayout,
op_times>(sycl_device, tensor_range);
test_binary_builtins_func<DataType, DataLayout,
op_divide>(sycl_device, tensor_range);
}
template <typename DataType>
static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
  int64_t sizeDim1 = 10;
  int64_t sizeDim2 = 10;
  int64_t sizeDim3 = 10;
  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
  test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
  test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
}
template <typename DataType>
static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
int64_t sizeDim1 = 10;
int64_t sizeDim2 = 10;
int64_t sizeDim3 = 10;
array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
test_binary_builtins_fixed_arg2<DataType, RowMajor,
op_modulo>(sycl_device, tensor_range);
test_binary_builtins_fixed_arg2<DataType, ColMajor,
op_modulo>(sycl_device, tensor_range);
}

EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    QueueInterface queueInterface(device);
    Eigen::SyclDevice sycl_device(&queueInterface);
    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
  }
}
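For readability, the functor generated by DECLARE_UNARY_STRUCT(sqrt) above expands to roughly the following; the scalar overload produces the host-side reference value through the cl::sycl builtin, while the TensorMap overload builds the Eigen expression evaluated on the device:

// Approximate expansion of DECLARE_UNARY_STRUCT(sqrt) as it appears inside the
// test translation unit (sketch, shown only to make the macro easier to read).
struct op_sqrt {
  template <typename T>
  auto operator()(const T& x) -> decltype(cl::sycl::sqrt(x)) {
    return cl::sycl::sqrt(x);
  }
  template <typename T>
  auto operator()(const TensorMap<T>& x) -> decltype(x.sqrt()) {
    return x.sqrt();
  }
};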

View File

@@ -419,6 +419,7 @@ static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
  const size_t input2TensorBuffSize =input2.size()*sizeof(DataType);
  std::cout << tensorBuffSize << " , "<< input2TensorBuffSize << std::endl;
  DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
  DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
  DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize));
@@ -605,14 +606,14 @@ static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  /* test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device);
  test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);*/
  test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
  // test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
{

File diff suppressed because it is too large

View File

@@ -80,6 +80,8 @@ static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
      VERIFY_IS_EQUAL(out(i, j), 0);
    }
  }
  sycl_device.deallocate(gpu_in1_data);
  sycl_device.deallocate(gpu_out_data);
}
template<typename TensorType>
@@ -147,6 +149,9 @@ static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
      }
    }
  }
  sycl_device.deallocate(gpu_in1_data);
  sycl_device.deallocate(gpu_in2_data);
  sycl_device.deallocate(gpu_out_data);
}
template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){

View File

@@ -36,8 +36,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
  DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
  DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
  in1 = in1.random() + in1.constant(static_cast<DataType>(10.0f));
  in2 = in2.random() + in2.constant(static_cast<DataType>(10.0f));
  // creating TensorMap from tensor
  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
@@ -72,5 +72,6 @@ template <typename DataType, typename Dev_selector> void tensorForced_evalperDev
EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
    CALL_SUBTEST(tensorForced_evalperDevice<half>(device));
  }
}

View File

@ -0,0 +1,103 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
using Eigen::Tensor;
using Eigen::RowMajor;
template <typename DataType, int DataLayout, typename IndexType>
static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device)
{
IndexType sizeDim1 = 245;
IndexType sizeDim2 = 343;
IndexType sizeDim3 = 577;
array<IndexType, 3> input_range ={{sizeDim1, sizeDim2, sizeDim3}};
array<IndexType, 3> slice_range ={{sizeDim1-1, sizeDim2, sizeDim3}};
Tensor<DataType, 3,DataLayout, IndexType> tensor1(input_range);
Tensor<DataType, 3,DataLayout, IndexType> tensor2(input_range);
Tensor<DataType, 3, DataLayout, IndexType> tensor3(slice_range);
Tensor<DataType, 3, DataLayout, IndexType> tensor3_cpu(slice_range);
typedef Eigen::DSizes<IndexType, 3> Index3;
Index3 strides1(1L,1L, 1L);
Index3 indicesStart1(1L, 0L, 0L);
Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3);
Index3 strides2(1L,1L, 1L);
Index3 indicesStart2(0L, 0L, 0L);
Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3);
Eigen::DSizes<IndexType, 3> sizes(sizeDim1-1,sizeDim2,sizeDim3);
tensor1.setRandom();
tensor2.setRandom();
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, input_range);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, input_range);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu3(gpu_data3, slice_range);
sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType));
gpu3.device(sycl_device)= gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes);
sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2);
for (IndexType i = 0; i <slice_range[0] ; ++i) {
for (IndexType j = 0; j < slice_range[1]; ++j) {
for (IndexType k = 0; k < slice_range[2]; ++k) {
VERIFY_IS_EQUAL(tensor3_cpu(i,j,k), tensor3(i,j,k));
}
}
}
sycl_device.deallocate(gpu_data1);
sycl_device.deallocate(gpu_data2);
sycl_device.deallocate(gpu_data3);
}
template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_image_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
}
EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
#endif
}
}
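The verification above relies on slice() and stridedSlice() with unit strides selecting the same block of elements; a host-only sketch of that equivalence (illustrative only, not part of this patch):

// Host-only sketch: with unit strides, stridedSlice(start, stop, stride)
// selects the same elements as slice(start, stop - start).
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  Eigen::Tensor<float, 3> t(4, 5, 6);
  t.setRandom();

  Eigen::DSizes<Eigen::DenseIndex, 3> start(1, 0, 0);
  Eigen::DSizes<Eigen::DenseIndex, 3> stop(4, 5, 6);
  Eigen::DSizes<Eigen::DenseIndex, 3> stride(1, 1, 1);
  Eigen::DSizes<Eigen::DenseIndex, 3> extent(3, 5, 6);  // stop - start, element-wise

  Eigen::Tensor<float, 3> a = t.stridedSlice(start, stop, stride);
  Eigen::Tensor<float, 3> b = t.slice(start, extent);
  for (Eigen::DenseIndex i = 0; i < a.size(); ++i) {
    assert(a.data()[i] == b.data()[i]);
  }
  return 0;
}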

View File

@ -0,0 +1,105 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
using Eigen::Tensor;
using Eigen::RowMajor;
template <typename DataType, int DataLayout, typename IndexType>
static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device)
{
IndexType sizeDim1 = 4;
IndexType sizeDim2 = 4;
IndexType sizeDim3 = 1;
array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
in = in.random();
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
gpu2.device(sycl_device) = gpu1.tanh();
sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
out_cpu=in.tanh();
for (int i = 0; i < in.size(); ++i) {
VERIFY_IS_APPROX(out(i), out_cpu(i));
}
}
template <typename DataType, int DataLayout, typename IndexType>
static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device)
{
IndexType sizeDim1 = 4;
IndexType sizeDim2 = 4;
IndexType sizeDim3 = 1;
array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
in = in.random();
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
gpu2.device(sycl_device) = gpu1.sigmoid();
sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
out_cpu=in.sigmoid();
for (int i = 0; i < in.size(); ++i) {
VERIFY_IS_APPROX(out(i), out_cpu(i));
}
}
template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_tanh_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_tanh_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_sigmoid_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_sigmoid_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
}
}

View File

@@ -180,6 +180,82 @@ static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
  sycl_device.deallocate(gpu_data3);
}
template <typename DataType, int DataLayout, typename IndexType>
static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device)
{
IndexType sizeDim1 = 2;
IndexType sizeDim2 = 3;
IndexType sizeDim3 = 5;
IndexType sizeDim4 = 7;
IndexType sizeDim5 = 11;
typedef Eigen::DSizes<IndexType, 5> Index5;
Index5 strides(1L,1L,1L,1L,1L);
Index5 indicesStart(1L,2L,3L,4L,5L);
Index5 indicesStop(2L,3L,4L,5L,6L);
Index5 lengths(1L,1L,1L,1L,1L);
array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
Tensor<DataType, 5, DataLayout, IndexType> tensor(tensorRange);
tensor.setRandom();
array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
Tensor<DataType, 5, DataLayout, IndexType> slice_stride1(slice1_range);
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
DataType* gpu_data_stride2 = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride2(gpu_data_stride2, slice1_range);
Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides);
sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType));
VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5));
array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
Tensor<DataType, 5, DataLayout, IndexType> strideSlice2(slice2_range);
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
DataType* gpu_data_stride3 = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride3(gpu_data_stride3, slice2_range);
Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
Index5 strides2(1L,1L,1L,1L,1L);
Index5 indicesStart2(1L,1L,3L,4L,5L);
Index5 indicesStop2(2L,2L,5L,6L,8L);
gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2);
sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType));
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 2; ++j) {
for (IndexType k = 0; k < 3; ++k) {
VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
}
}
}
sycl_device.deallocate(gpu_data1);
sycl_device.deallocate(gpu_data2);
sycl_device.deallocate(gpu_data3);
}
template<typename DataType, int DataLayout, typename IndexType>
static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
{
@@ -228,6 +304,65 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
  sycl_device.deallocate(gpu_data3);
}
template <typename OutIndex, typename DSizes>
Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) {
Eigen::array<OutIndex, DSizes::count> out;
for (int i = 0; i < DSizes::count; ++i) {
out[i] = in[i];
}
return out;
}
template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType>
int run_eigen(const SyclDevice& sycl_device) {
using TensorI64 = Tensor<DataType, 5, DataLayout, IndexType>;
using TensorI32 = Tensor<DataType, 5, DataLayout, ConvertedIndexType>;
using TensorMI64 = TensorMap<TensorI64>;
using TensorMI32 = TensorMap<TensorI32>;
Eigen::array<IndexType, 5> tensor_range{{4, 1, 1, 1, 6}};
Eigen::array<IndexType, 5> slice_range{{4, 1, 1, 1, 3}};
TensorI64 out_tensor_gpu(tensor_range);
TensorI64 out_tensor_cpu(tensor_range);
out_tensor_cpu.setRandom();
TensorI64 sub_tensor(slice_range);
sub_tensor.setRandom();
DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType)));
DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType)));
TensorMI64 out_gpu(out_gpu_data, tensor_range);
TensorMI64 sub_gpu(sub_gpu_data, slice_range);
sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType));
sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType));
Eigen::array<ConvertedIndexType, 5> slice_offset_32{{0, 0, 0, 0, 3}};
Eigen::array<ConvertedIndexType, 5> slice_range_32{{4, 1, 1, 1, 3}};
TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions()));
TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions()));
TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions()));
TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions()));
out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32;
out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32;
sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType));
int has_err = 0;
for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) {
auto exp = out_tensor_cpu(i);
auto val = out_tensor_gpu(i);
if (val != exp) {
std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl;
has_err = 1;
}
}
sycl_device.deallocate(out_gpu_data);
sycl_device.deallocate(sub_gpu_data);
return has_err;
}
template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
@@ -239,6 +374,9 @@ template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_d
  test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device);
  run_eigen<float, RowMajor, long, int>(sycl_device);
}
EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
{

View File

@ -0,0 +1,100 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
template <typename DataType, int DataLayout, typename IndexType>
static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device)
{
Tensor<DataType, 2,DataLayout, IndexType> out(72,97);
out.setZero();
std::size_t out_bytes = out.size() * sizeof(DataType);
IndexType sizeDim0 = 72;
IndexType sizeDim1 = 97;
array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
gpu_out.device(sycl_device)=gpu_out.random();
sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
for(IndexType i=1; i<sizeDim0; i++)
for(IndexType j=1; j<sizeDim1; j++)
{
VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1)); }
// For now we just check that the code doesn't crash.
// TODO: come up with a valid test of randomness
sycl_device.deallocate(d_out);
}
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device)
{
Tensor<DataType, 2,DataLayout,IndexType> out(72,97);
out.setZero();
std::size_t out_bytes = out.size() * sizeof(DataType);
IndexType sizeDim0 = 72;
IndexType sizeDim1 = 97;
array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
Eigen::internal::NormalRandomGenerator<DataType> gen(true);
gpu_out.device(sycl_device)=gpu_out.random(gen);
sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
for(IndexType i=1; i<sizeDim0; i++)
for(IndexType j=1; j<sizeDim1; j++)
{
VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
}
// For now we just check this code doesn't crash.
// TODO: come up with a valid test of randomness
sycl_device.deallocate(d_out);
}
template<typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_sycl_random_uniform<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_random_uniform<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_random_normal<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_random_normal<DataType, ColMajor, int64_t>(sycl_device);
}
EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_random_test_per_device<float>(device));
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST(sycl_random_test_per_device<double>(device));
#endif
}
}

File diff suppressed because it is too large

View File

@ -20,10 +20,8 @@
#include "main.h" #include "main.h"
#include <unsupported/Eigen/CXX11/Tensor> #include <unsupported/Eigen/CXX11/Tensor>
template <typename DataType, int DataLayout, typename IndexType> template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
IndexType dim1 = 2; IndexType dim1 = 2;
IndexType dim2 = 3; IndexType dim2 = 3;
IndexType dim3 = 5; IndexType dim3 = 5;
@ -40,21 +38,30 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
dim_rev[2] = true;
dim_rev[3] = false;
DataType* gpu_in_data = static_cast<DataType*>(
sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
reversed_tensor.dimensions().TotalSize() * sizeof(DataType)));
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data,
tensorRange);
sycl_device.memcpyHostToDevice(
gpu_in_data, tensor.data(),
(tensor.dimensions().TotalSize()) * sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(
reversed_tensor.data(), gpu_out_data,
reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i, j, k, l),
reversed_tensor(i, 2 - j, 4 - k, l));
}
}
}
@ -65,13 +72,15 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
dim_rev[3] = false;
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(
reversed_tensor.data(), gpu_out_data,
reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l));
}
}
}
@ -82,13 +91,16 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
dim_rev[2] = false;
dim_rev[3] = true;
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(
reversed_tensor.data(), gpu_out_data,
reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i, j, k, l),
reversed_tensor(1 - i, j, k, 6 - l));
}
}
}
@ -98,11 +110,9 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
sycl_device.deallocate(gpu_out_data);
}
template <typename DataType, int DataLayout, typename IndexType>
static void test_expr_reverse(const Eigen::SyclDevice& sycl_device,
bool LValue) {
IndexType dim1 = 2;
IndexType dim2 = 3;
IndexType dim3 = 5;
@ -120,24 +130,32 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
dim_rev[2] = false;
dim_rev[3] = true;
DataType* gpu_in_data = static_cast<DataType*>(
sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate(
expected.dimensions().TotalSize() * sizeof(DataType)));
DataType* gpu_out_data_result = static_cast<DataType*>(
sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType)));
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(
gpu_out_data_expected, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(
gpu_out_data_result, tensorRange);
sycl_device.memcpyHostToDevice(
gpu_in_data, tensor.data(),
(tensor.dimensions().TotalSize()) * sizeof(DataType));
if (LValue) {
out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
} else {
out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
}
sycl_device.memcpyDeviceToHost(
expected.data(), gpu_out_data_expected,
expected.dimensions().TotalSize() * sizeof(DataType));
array<IndexType, 4> src_slice_dim;
src_slice_dim[0] = 2;
@ -154,8 +172,9 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
for (IndexType i = 0; i < 5; ++i) {
if (LValue) {
out_gpu_result.slice(dst_slice_start, dst_slice_dim)
.reverse(dim_rev)
.device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim);
} else {
out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
@ -163,13 +182,15 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
src_slice_start[2] += 1;
dst_slice_start[2] += 1;
}
sycl_device.memcpyDeviceToHost(
result.data(), gpu_out_data_result,
result.dimensions().TotalSize() * sizeof(DataType));
for (IndexType i = 0; i < expected.dimension(0); ++i) {
for (IndexType j = 0; j < expected.dimension(1); ++j) {
for (IndexType k = 0; k < expected.dimension(2); ++k) {
for (IndexType l = 0; l < expected.dimension(3); ++l) {
VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
}
}
}
@ -177,34 +198,37 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
dst_slice_start[2] = 0;
result.setRandom();
sycl_device.memcpyHostToDevice(
gpu_out_data_result, result.data(),
(result.dimensions().TotalSize()) * sizeof(DataType));
for (IndexType i = 0; i < 5; ++i) {
if (LValue) {
out_gpu_result.slice(dst_slice_start, dst_slice_dim)
.reverse(dim_rev)
.device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim);
} else {
out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
}
dst_slice_start[2] += 1;
}
sycl_device.memcpyDeviceToHost(
result.data(), gpu_out_data_result,
result.dimensions().TotalSize() * sizeof(DataType));
for (IndexType i = 0; i < expected.dimension(0); ++i) {
for (IndexType j = 0; j < expected.dimension(1); ++j) {
for (IndexType k = 0; k < expected.dimension(2); ++k) {
for (IndexType l = 0; l < expected.dimension(3); ++l) {
VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
}
}
}
}
}
template <typename DataType>
void sycl_reverse_test_per_device(const cl::sycl::device& d) {
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
@ -215,7 +239,15 @@ template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::de
test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
}
EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
for (const auto& device : Eigen::get_sycl_supported_devices()) {
std::cout << "Running on "
<< device.get_info<cl::sycl::info::device::name>() << std::endl;
CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device));
CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device));
CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device));
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
#endif
CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
}
}

View File

@ -0,0 +1,141 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size,
IndexType k_size, IndexType n_size, int consume_dim,
bool exclusive) {
static const DataType error_threshold = 1e-4f;
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size
<< " consume_dim : " << consume_dim << ")" << std::endl;
Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size);
Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size);
Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size,
n_size);
t_input.setRandom();
std::size_t t_input_bytes = t_input.size() * sizeof(DataType);
std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
DataType* gpu_data_in =
static_cast<DataType*>(sycl_device.allocate(t_input_bytes));
DataType* gpu_data_out =
static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}};
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input(
gpu_data_in, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result(
gpu_data_out, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes);
sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes);
gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive);
t_result = t_input.cumsum(consume_dim, exclusive);
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out,
t_result_bytes);
sycl_device.synchronize();
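// Accept a value if either the absolute difference is below error_threshold
// or Eigen::internal::isApprox considers it relatively close; long cumulative
// sums can accumulate rounding error, so both tolerances are checked.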
for (IndexType i = 0; i < t_result.size(); i++) {
if (static_cast<DataType>(std::fabs(static_cast<DataType>(
t_result(i) - t_result_gpu(i)))) < error_threshold) {
continue;
}
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
error_threshold)) {
continue;
}
std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i)
<< " vs SYCL : " << t_result_gpu(i) << std::endl;
assert(false);
}
sycl_device.deallocate(gpu_data_in);
sycl_device.deallocate(gpu_data_out);
}
template <typename DataType, typename Dev>
void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) {
test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
true);
test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
true);
}
template <typename DataType, typename Dev>
void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) {
test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
true);
test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
true);
}
template <typename DataType, typename Dev>
void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) {
test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
true);
test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
true);
}
template <typename DataType, typename Dev>
void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) {
test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
false);
test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
false);
}
template <typename DataType, typename Dev>
void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) {
test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
false);
test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
false);
}
template <typename DataType, typename Dev>
void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) {
test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
false);
test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
false);
}
EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) {
for (const auto& device : Eigen::get_sycl_supported_devices()) {
std::cout << "Running on "
<< device.template get_info<cl::sycl::info::device::name>()
<< std::endl;
QueueInterface queueInterface(device);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
CALL_SUBTEST_1(
sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device));
CALL_SUBTEST_2(
sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device));
CALL_SUBTEST_3(
sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device));
CALL_SUBTEST_4(
sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device));
CALL_SUBTEST_5(
sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device));
CALL_SUBTEST_6(
sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device));
}
}

View File

@ -12,14 +12,12 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
@ -29,33 +27,33 @@ using Eigen::Tensor;
using Eigen::TensorMap;
template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) {
IndexType sizeDim1 = 2;
IndexType sizeDim2 = 3;
IndexType sizeDim3 = 5;
IndexType sizeDim4 = 7;
array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
Tensor<DataType, 4, DataLayout, IndexType> no_shuffle(tensorRange);
tensor.setRandom();
const size_t buffSize = tensor.size() * sizeof(DataType);
array<IndexType, 4> shuffles;
shuffles[0] = 0;
shuffles[1] = 1;
shuffles[2] = 2;
shuffles[3] = 3;
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1,
tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2,
tensorRange);
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
gpu2.device(sycl_device) = gpu1.shuffle(shuffles);
sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize);
sycl_device.synchronize();
@ -68,7 +66,7 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
for (IndexType l = 0; l < sizeDim4; ++l) {
VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l));
}
}
}
@ -78,12 +76,14 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
shuffles[1] = 3;
shuffles[2] = 1;
shuffles[3] = 0;
array<IndexType, 4> tensorrangeShuffle = {
{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
Tensor<DataType, 4, DataLayout, IndexType> shuffle(tensorrangeShuffle);
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu3(
gpu_data3, tensorrangeShuffle);
gpu3.device(sycl_device) = gpu1.shuffle(shuffles);
sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
sycl_device.synchronize();
@ -96,24 +96,22 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
for (IndexType l = 0; l < sizeDim4; ++l) {
VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i));
}
}
}
}
}
template <typename DataType, typename dev_Selector>
void sycl_shuffling_test_per_device(dev_Selector s) {
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
for (const auto& device : Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
}
}

View File

@ -29,9 +29,9 @@ using Eigen::TensorMap;
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
IndexType sizeDim1 = 5;
IndexType sizeDim2 = 5;
IndexType sizeDim3 = 1;
array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
@ -56,6 +56,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
sycl_device.synchronize();
for (IndexType i = 0; i < in1.size(); ++i) {
// std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n";
VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
@ -93,6 +94,88 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_data);
}
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) {
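// Verifies that the device memcpy routines honour pointer offsets by swapping
// the two halves of a buffer: once on the device-to-host copies, once on the
// host-to-device copies, and once with device-to-device copies.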
using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
IndexType full_size = 32;
IndexType half_size = full_size / 2;
array<IndexType, 1> tensorRange = {{full_size}};
tensor_type in1(tensorRange);
tensor_type out(tensorRange);
DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
in1 = in1.random();
// Copy all data to device, then permute on copy back to host
sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType));
sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType));
for (IndexType i = 0; i < half_size; ++i) {
VERIFY_IS_APPROX(out(i), in1(i + half_size));
VERIFY_IS_APPROX(out(i + half_size), in1(i));
}
in1 = in1.random();
out.setZero();
// Permute copies to device, then copy all back to host
sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType));
sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
for (IndexType i = 0; i < half_size; ++i) {
VERIFY_IS_APPROX(out(i), in1(i + half_size));
VERIFY_IS_APPROX(out(i + half_size), in1(i));
}
in1 = in1.random();
out.setZero();
DataType* gpu_data_out = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange);
// Copy all to device, permute copies on device, then copy all back to host
sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType));
sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType));
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType));
for (IndexType i = 0; i < half_size; ++i) {
VERIFY_IS_APPROX(out(i), in1(i + half_size));
VERIFY_IS_APPROX(out(i + half_size), in1(i));
}
sycl_device.deallocate(gpu_data_out);
sycl_device.deallocate(gpu_data);
}
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) {
using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
IndexType full_size = 32;
IndexType half_size = full_size / 2;
array<IndexType, 1> tensorRange = {{full_size}};
tensor_type cpu_out(tensorRange);
tensor_type out(tensorRange);
cpu_out.setZero();
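// Note: memset writes the given byte into every byte of the range, so for a
// multi-byte DataType the second half ends up holding the 0x01-per-byte bit
// pattern rather than the value 1. The host and device buffers are filled the
// same way, so the element-wise comparison below remains valid.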
std::memset(cpu_out.data(), 0, half_size * sizeof(DataType));
std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType));
DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType));
sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType));
sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
for (IndexType i = 0; i < full_size; ++i) {
VERIFY_IS_APPROX(out(i), cpu_out(i));
}
sycl_device.deallocate(gpu_data);
}
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
@ -262,6 +345,8 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);