[SYCL-2020] Enabling USM support for SYCL. SYCL 1.2.1 did not support USM.

This commit is contained in:
Mehdi Goli 2023-05-05 17:30:36 +00:00 committed by Rasmus Munk Larsen
parent 1698c367a0
commit 0623791930
40 changed files with 164 additions and 1707 deletions
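
At a high level, this change drops the virtual-pointer/buffer-map memory model and manages device memory through SYCL 2020 Unified Shared Memory (USM). A minimal sketch of the USM pattern the new code relies on, assuming a SYCL 2020 implementation (the queue setup, sizes, and names below are illustrative, not Eigen API):

#include <CL/sycl.hpp>
#include <vector>

int main() {
  // In-order queue, matching the property the new QueueInterface sets.
  cl::sycl::queue q{cl::sycl::property_list{cl::sycl::property::queue::in_order()}};
  constexpr size_t n = 1024;
  // Device-only USM allocation: a real device pointer, not a key into a
  // pointer-to-buffer map.
  float* d = cl::sycl::malloc_device<float>(n, q);
  std::vector<float> h(n, 1.0f);
  q.memcpy(d, h.data(), n * sizeof(float)).wait();  // host -> device
  q.fill(d, 2.0f, n).wait();                        // fill n elements on device
  q.memcpy(h.data(), d, n * sizeof(float)).wait();  // device -> host
  cl::sycl::free(d, q);
  return h[0] == 2.0f ? 0 : 1;
}
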

View File

@ -249,7 +249,6 @@ using std::ptrdiff_t;
#endif
#if defined(EIGEN_USE_SYCL)
#include "src/Core/arch/SYCL/SyclMemoryModel.h"
#include "src/Core/arch/SYCL/InteropHeaders.h"
#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
#include "src/Core/arch/SYCL/PacketMath.h"

View File

@ -33,13 +33,9 @@ namespace internal {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
pload##AlignedType<packet_type>( \
const typename unpacket_traits<packet_type>::type* from) { \
using scalar = typename unpacket_traits<packet_type>::type; \
typedef cl::sycl::multi_ptr< \
const scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(from);\
packet_type res{}; \
res.load(0, multi_ptr(from)); \
res.load(0, ptr); \
return res; \
}
@ -54,11 +50,8 @@ SYCL_PLOAD(cl::sycl::cl_double2, )
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
scalar* to, const packet_type& from) { \
typedef cl::sycl::multi_ptr< \
scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
from.store(0, multi_ptr(to)); \
auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(to);\
from.store(0, ptr); \
}
SYCL_PSTORE(float, cl::sycl::cl_float4, )
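
The new pload/pstore bodies above build a generic-address-space multi_ptr from a raw pointer via address_space_cast instead of constructing a multi_ptr from a placeholder accessor. Roughly, the pattern the macros expand to looks like the following sketch (assuming a SYCL 2020 compiler; the helper names are illustrative):

#include <CL/sycl.hpp>

// Device-side helpers mirroring the macro bodies above; `Packet` is a
// cl::sycl::vec type such as cl::sycl::cl_float4.
template <typename Packet, typename Scalar>
inline Packet usm_pload(const Scalar* from) {
  auto ptr = cl::sycl::address_space_cast<
      cl::sycl::access::address_space::generic_space,
      cl::sycl::access::decorated::no>(from);
  Packet res{};
  res.load(0, ptr);  // vec::load accepts a generic-space multi_ptr
  return res;
}

template <typename Packet, typename Scalar>
inline void usm_pstore(Scalar* to, const Packet& from) {
  auto ptr = cl::sycl::address_space_cast<
      cl::sycl::access::address_space::generic_space,
      cl::sycl::access::decorated::no>(to);
  from.store(0, ptr);
}
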
@ -370,60 +363,6 @@ inline cl::sycl::cl_double2 pblend(
}
#endif // SYCL_DEVICE_ONLY
template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, T>& from) {
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}
#define SYCL_PLOAD(Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type> \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD
template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>& from) {
return ploadt<packet_type, Alignment>(from.get_pointer());
}
#define SYCL_PSTORE(alignment) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type>& to, \
const packet_type& from) { \
pstore##alignment(to.get_pointer(), from); \
}
// global space
SYCL_PSTORE()
SYCL_PSTORE(u)
#undef SYCL_PSTORE
template <typename scalar, typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(
Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>
to,
const packet_type& from) {
pstoret<scalar, packet_type, Alignment>(to.get_pointer(), from);
}
} // end namespace internal
} // end namespace Eigen

View File

@ -1,700 +0,0 @@
/***************************************************************************
* Copyright (C) 2017 Codeplay Software Limited
* This Source Code Form is subject to the terms of the Mozilla
* Public License v. 2.0. If a copy of the MPL was not distributed
* with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
*
* SyclMemoryModel.h
*
* Description:
* Interface for SYCL buffers to behave as a non-dereferenceable pointer
* Interface for Placeholder accessor to behave as a pointer on both host
* and device
*
* Authors:
*
* Ruyman Reyes Codeplay Software Ltd.
* Mehdi Goli Codeplay Software Ltd.
* Vanya Yaneva Codeplay Software Ltd.
*
**************************************************************************/
#if defined(EIGEN_USE_SYCL) && \
!defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H)
#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
#include <CL/sycl.hpp>
#ifdef EIGEN_EXCEPTIONS
#include <stdexcept>
#endif
#include <cstddef>
#include <queue>
#include <set>
#include <unordered_map>
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace TensorSycl {
namespace internal {
using sycl_acc_target = cl::sycl::access::target;
using sycl_acc_mode = cl::sycl::access::mode;
/**
* Default values for template arguments
*/
using buffer_data_type_t = uint8_t;
const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer;
const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write;
/**
* PointerMapper
* Associates fake pointers with buffers.
*
*/
class PointerMapper {
public:
using base_ptr_t = std::intptr_t;
/* Structure of a virtual pointer
*
* |================================================|
* | POINTER ADDRESS |
* |================================================|
*/
struct virtual_pointer_t {
/* Type for the pointers
*/
base_ptr_t m_contents;
/** Conversions from virtual_pointer_t to
* void * should just reinterpret_cast the integer number
*/
operator void *() const { return reinterpret_cast<void *>(m_contents); }
/**
* Convert back to the integer number.
*/
operator base_ptr_t() const { return m_contents; }
/**
* Add a certain value to the pointer to create a
* new pointer to that offset
*/
virtual_pointer_t operator+(size_t off) { return m_contents + off; }
/* Numerical order for sorting pointers in containers. */
bool operator<(virtual_pointer_t rhs) const {
return (static_cast<base_ptr_t>(m_contents) <
static_cast<base_ptr_t>(rhs.m_contents));
}
bool operator>(virtual_pointer_t rhs) const {
return (static_cast<base_ptr_t>(m_contents) >
static_cast<base_ptr_t>(rhs.m_contents));
}
/**
* Numerical order for sorting pointers in containers
*/
bool operator==(virtual_pointer_t rhs) const {
return (static_cast<base_ptr_t>(m_contents) ==
static_cast<base_ptr_t>(rhs.m_contents));
}
/**
* Simple forward to the equality overload.
*/
bool operator!=(virtual_pointer_t rhs) const {
return !(this->operator==(rhs));
}
/**
* Converts a void * into a virtual pointer structure.
* Note that this will only work if the void * was
* already a virtual_pointer_t, but we have no way of
* checking
*/
virtual_pointer_t(const void *ptr)
: m_contents(reinterpret_cast<base_ptr_t>(ptr)){};
/**
* Creates a virtual_pointer_t from the given integer
* number
*/
virtual_pointer_t(base_ptr_t u) : m_contents(u){};
};
/* Definition of a null pointer
*/
const virtual_pointer_t null_virtual_ptr = nullptr;
/**
* Returns whether a pointer is null.
* A pointer is null if its value equals null_virtual_ptr
*/
static inline bool is_nullptr(virtual_pointer_t ptr) {
return (static_cast<void *>(ptr) == nullptr);
}
/* basic type for all buffers
*/
using buffer_t = cl::sycl::buffer<buffer_data_type_t>;
/**
* Node that stores information about a device allocation.
* Nodes are sorted by size to organise a free list of nodes
* that can be recovered.
*/
struct pMapNode_t {
buffer_t m_buffer;
size_t m_size;
bool m_free;
pMapNode_t(buffer_t b, size_t size, bool f)
: m_buffer{b}, m_size{size}, m_free{f} {
m_buffer.set_final_data(nullptr);
}
bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); }
};
/** Storage of the pointer / buffer tree
*/
using pointerMap_t = std::map<virtual_pointer_t, pMapNode_t>;
/**
* Obtain the insertion point in the pointer map for
* a pointer of the given size.
* \param requiredSize Size attempted to reclaim
*/
typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) {
typename pointerMap_t::iterator retVal;
bool reuse = false;
if (!m_freeList.empty()) {
// try to re-use an existing block
for (auto freeElem : m_freeList) {
if (freeElem->second.m_size >= requiredSize) {
retVal = freeElem;
reuse = true;
// Element is not going to be free anymore
m_freeList.erase(freeElem);
break;
}
}
}
if (!reuse) {
retVal = std::prev(m_pointerMap.end());
}
return retVal;
}
/**
* Returns an iterator to the node that stores the information
* of the given virtual pointer from the given pointer map structure.
* If pointer is not found, throws std::out_of_range.
* If the pointer map structure is empty, throws std::out_of_range
*
* \param pMap the pointerMap_t structure storing all the pointers
* \param virtual_pointer_ptr The virtual pointer to obtain the node of
* \throws std::out_of_range if the pointer is not found or pMap is empty
*/
typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) {
if (this->count() == 0) {
m_pointerMap.clear();
EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n"));
}
if (is_nullptr(ptr)) {
m_pointerMap.clear();
EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n"));
}
// The previous element to the lower bound is the node that
// holds this memory address
auto node = m_pointerMap.lower_bound(ptr);
// If the value of the pointer is not the one of the node
// then we return the previous one
if (node == std::end(m_pointerMap)) {
--node;
} else if (node->first != ptr) {
if (node == std::begin(m_pointerMap)) {
m_pointerMap.clear();
EIGEN_THROW_X(
std::out_of_range("The pointer is not registered in the map\n"));
}
--node;
}
return node;
}
/* get_buffer.
* Returns a buffer from the map using the pointer address
*/
template <typename buffer_data_type = buffer_data_type_t>
cl::sycl::buffer<buffer_data_type, 1> get_buffer(
const virtual_pointer_t ptr) {
auto node = get_node(ptr);
auto& map_node = node->second;
eigen_assert(node->first == ptr || node->first < ptr);
eigen_assert(ptr < static_cast<virtual_pointer_t>(map_node.m_size +
node->first));
return map_node.m_buffer.reinterpret<buffer_data_type>(
cl::sycl::range<1>{map_node.m_size / sizeof(buffer_data_type)});
}
/**
* @brief Returns an accessor to the buffer of the given virtual pointer
* @param accessMode
* @param accessTarget
* @param ptr The virtual pointer
*/
template <sycl_acc_mode access_mode = default_acc_mode,
sycl_acc_target access_target = default_acc_target,
typename buffer_data_type = buffer_data_type_t>
cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
get_access(const virtual_pointer_t ptr) {
auto buf = get_buffer<buffer_data_type>(ptr);
return buf.template get_access<access_mode, access_target>();
}
/**
* @brief Returns an accessor to the buffer of the given virtual pointer
* in the given command group scope
* @param accessMode
* @param accessTarget
* @param ptr The virtual pointer
* @param cgh Reference to the command group scope
*/
template <sycl_acc_mode access_mode = default_acc_mode,
sycl_acc_target access_target = default_acc_target,
typename buffer_data_type = buffer_data_type_t>
cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) {
auto buf = get_buffer<buffer_data_type>(ptr);
return buf.template get_access<access_mode, access_target>(cgh);
}
/*
* Returns the offset from the base address of this pointer.
*/
inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) {
// The previous element to the lower bound is the node that
// holds this memory address
auto node = get_node(ptr);
auto start = node->first;
eigen_assert(start == ptr || start < ptr);
eigen_assert(ptr < start + node->second.m_size);
return (ptr - start);
}
/*
* Returns the number of elements by which the given pointer is offset from
* the base address.
*/
template <typename buffer_data_type>
inline size_t get_element_offset(const virtual_pointer_t ptr) {
return get_offset(ptr) / sizeof(buffer_data_type);
}
/**
* Constructs the PointerMapper structure.
*/
PointerMapper(base_ptr_t baseAddress = 4096)
: m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} {
if (m_baseAddress == 0) {
EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n"));
}
};
/**
* PointerMapper cannot be copied or moved
*/
PointerMapper(const PointerMapper &) = delete;
/**
* Empty the pointer list
*/
inline void clear() {
m_freeList.clear();
m_pointerMap.clear();
}
/* add_pointer.
* Adds an existing pointer to the map and returns the virtual pointer id.
*/
inline virtual_pointer_t add_pointer(const buffer_t &b) {
return add_pointer_impl(b);
}
/* add_pointer.
* Adds a pointer to the map and returns the virtual pointer id.
*/
inline virtual_pointer_t add_pointer(buffer_t &&b) {
return add_pointer_impl(b);
}
/**
* @brief Fuses the given node with the previous nodes in the
* pointer map if they are free
*
* @param node A reference to the free node to be fused
*/
void fuse_forward(typename pointerMap_t::iterator &node) {
while (node != std::prev(m_pointerMap.end())) {
// if following node is free
// remove it and extend the current node with its size
auto fwd_node = std::next(node);
if (!fwd_node->second.m_free) {
break;
}
auto fwd_size = fwd_node->second.m_size;
m_freeList.erase(fwd_node);
m_pointerMap.erase(fwd_node);
node->second.m_size += fwd_size;
}
}
/**
* @brief Fuses the given node with the following nodes in the
* pointer map if they are free
*
* @param node A reference to the free node to be fused
*/
void fuse_backward(typename pointerMap_t::iterator &node) {
while (node != m_pointerMap.begin()) {
// if previous node is free, extend it
// with the size of the current one
auto prev_node = std::prev(node);
if (!prev_node->second.m_free) {
break;
}
prev_node->second.m_size += node->second.m_size;
// remove the current node
m_freeList.erase(node);
m_pointerMap.erase(node);
// point to the previous node
node = prev_node;
}
}
/* remove_pointer.
* Removes the given pointer from the map.
* The pointer is allowed to be reused only if ReUse is true.
*/
template <bool ReUse = true>
void remove_pointer(const virtual_pointer_t ptr) {
if (is_nullptr(ptr)) {
return;
}
auto node = this->get_node(ptr);
node->second.m_free = true;
m_freeList.emplace(node);
// Fuse the node
// with free nodes before and after it
fuse_forward(node);
fuse_backward(node);
// If after fusing the node is the last one
// simply remove it (since it is free)
if (node == std::prev(m_pointerMap.end())) {
m_freeList.erase(node);
m_pointerMap.erase(node);
}
}
/* count.
* Return the number of active pointers (i.e., pointers that
* have been allocated but not freed).
*/
size_t count() const { return (m_pointerMap.size() - m_freeList.size()); }
private:
/* add_pointer_impl.
* Adds a pointer to the map and returns the virtual pointer id.
* BufferT is either a const buffer_t& or a buffer_t&&.
*/
template <class BufferT>
virtual_pointer_t add_pointer_impl(BufferT b) {
virtual_pointer_t retVal = nullptr;
size_t bufSize = b.get_count() * sizeof(buffer_data_type_t);
auto byte_buffer =
b.template reinterpret<buffer_data_type_t>(cl::sycl::range<1>{bufSize});
pMapNode_t p{byte_buffer, bufSize, false};
// If this is the first pointer:
if (m_pointerMap.empty()) {
virtual_pointer_t initialVal{m_baseAddress};
m_pointerMap.emplace(initialVal, p);
return initialVal;
}
auto lastElemIter = get_insertion_point(bufSize);
// We are recovering an existing free node
if (lastElemIter->second.m_free) {
lastElemIter->second.m_buffer = b;
lastElemIter->second.m_free = false;
// If the recovered node is bigger than the inserted one
// add a new free node with the remaining space
if (lastElemIter->second.m_size > bufSize) {
// create a new node with the remaining space
auto remainingSize = lastElemIter->second.m_size - bufSize;
pMapNode_t p2{b, remainingSize, true};
// update size of the current node
lastElemIter->second.m_size = bufSize;
// add the new free node
auto newFreePtr = lastElemIter->first + bufSize;
auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first;
m_freeList.emplace(freeNode);
}
retVal = lastElemIter->first;
} else {
size_t lastSize = lastElemIter->second.m_size;
retVal = lastElemIter->first + lastSize;
m_pointerMap.emplace(retVal, p);
}
return retVal;
}
/**
* Compare two iterators to pointer map entries according to
* the size of the allocation on the device.
*/
struct SortBySize {
bool operator()(typename pointerMap_t::iterator a,
typename pointerMap_t::iterator b) const {
return ((a->first < b->first) && (a->second <= b->second)) ||
((a->first < b->first) && (b->second <= a->second));
}
};
/* Maps the pointer addresses to buffer and size pairs.
*/
pointerMap_t m_pointerMap;
/* List of free nodes available for re-using
*/
std::set<typename pointerMap_t::iterator, SortBySize> m_freeList;
/* Base address used when issuing the first virtual pointer, allows users
* to specify alignment. Cannot be zero. */
std::intptr_t m_baseAddress;
};
/* remove_pointer.
* Removes the given pointer from the map.
* The pointer is allowed to be reused only if ReUse is true.
*/
template <>
inline void PointerMapper::remove_pointer<false>(const virtual_pointer_t ptr) {
if (is_nullptr(ptr)) {
return;
}
m_pointerMap.erase(this->get_node(ptr));
}
/**
* Malloc-like interface to the pointer-mapper.
* Given a size, creates a byte-typed buffer and returns a
* fake pointer to keep track of it.
* \param size Size in bytes of the desired allocation
* \throw cl::sycl::exception if error while creating the buffer
*/
inline void *SYCLmalloc(size_t size, PointerMapper &pMap) {
if (size == 0) {
return nullptr;
}
// Create a generic buffer of the given size
using buffer_t = cl::sycl::buffer<buffer_data_type_t, 1>;
auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size}));
// Store the buffer on the global list
return static_cast<void *>(thePointer);
}
/**
* Free-like interface to the pointer mapper.
* Given a fake-pointer created with the virtual-pointer malloc,
* destroys the buffer and remove it from the list.
* If ReUse is false, the pointer is not added to the freeList;
* it should be false only for sub-buffers.
*/
template <bool ReUse = true, typename PointerMapper>
inline void SYCLfree(void *ptr, PointerMapper &pMap) {
pMap.template remove_pointer<ReUse>(ptr);
}
/**
* Clear all the memory allocated by SYCL.
*/
template <typename PointerMapper>
inline void SYCLfreeAll(PointerMapper &pMap) {
pMap.clear();
}
template <cl::sycl::access::mode AcMd, typename T>
struct RangeAccess {
static const auto global_access = cl::sycl::access::target::global_buffer;
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
typedef T scalar_t;
typedef scalar_t &ref_t;
typedef scalar_t *ptr_t;
// the accessor type is not necessarily the same as T
typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
accessor;
typedef RangeAccess<AcMd, T> self_t;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access,
size_t offset,
std::intptr_t virtual_ptr)
: access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {}
RangeAccess(cl::sycl::buffer<scalar_t, 1> buff =
cl::sycl::buffer<scalar_t, 1>(cl::sycl::range<1>(1)))
: access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {}
// This should be only used for null constructor on the host side
RangeAccess(std::nullptr_t) : RangeAccess() {}
// This template parameter must be removed and scalar_t should be replaced
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
typedef cl::sycl::multi_ptr<scalar_t,
cl::sycl::access::address_space::generic_space,
cl::sycl::access::decorated::no>
multi_ptr;
multi_ptr p(access_);
return (p + offset_).get_raw();
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
offset_ += (offset);
return *this;
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const {
return self_t(access_, offset_ + offset, virtual_ptr_);
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const {
return self_t(access_, offset_ - offset, virtual_ptr_);
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) {
offset_ -= offset;
return *this;
}
// THIS IS FOR NULL COMPARISON ONLY
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
const RangeAccess &lhs, std::nullptr_t) {
return ((lhs.virtual_ptr_ == -1));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
const RangeAccess &lhs, std::nullptr_t i) {
return !(lhs == i);
}
// THIS IS FOR NULL COMPARISON ONLY
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
std::nullptr_t, const RangeAccess &rhs) {
return ((rhs.virtual_ptr_ == -1));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
std::nullptr_t i, const RangeAccess &rhs) {
return !(i == rhs);
}
// Prefix operator (Increment and return value)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() {
offset_++;
return (*this);
}
// Postfix operator (Return value and increment)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) {
EIGEN_UNUSED_VARIABLE(i);
self_t temp_iterator(*this);
offset_++;
return temp_iterator;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const {
return (access_.get_count() - offset_);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const {
return offset_;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) {
offset_ = offset;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const {
return *get_pointer();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() {
return *get_pointer();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) {
return *(get_pointer() + x);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const {
return *(get_pointer() + x);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const {
return reinterpret_cast<scalar_t *>(virtual_ptr_ +
(offset_ * sizeof(scalar_t)));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const {
return (virtual_ptr_ != -1);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess<AcMd, const T>() {
return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
operator RangeAccess<AcMd, const T>() const {
return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
}
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
cl::sycl::handler &cgh) const {
cgh.require(access_);
}
private:
accessor access_;
size_t offset_;
std::intptr_t virtual_ptr_; // the location of the buffer in the map
};
template <cl::sycl::access::mode AcMd, typename T>
struct RangeAccess<AcMd, const T> : RangeAccess<AcMd, T> {
typedef RangeAccess<AcMd, T> Base;
using Base::Base;
};
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
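
For reference, the file removed above implemented a malloc/free-style interface on top of SYCL buffers: SYCLmalloc created a byte buffer and returned a fake pointer (an integer key starting at the PointerMapper base address), get_offset recovered the byte offset inside the owning node, and SYCLfree marked the node free and coalesced it with free neighbours for reuse. A rough usage sketch of that removed interface, assuming a pre-USM Eigen checkout where this header still exists and a SYCL compiler with EIGEN_USE_SYCL defined:

#define EIGEN_USE_SYCL
#include <Eigen/Core>  // pulled in SyclMemoryModel.h before this commit
#include <cstddef>

int main() {
  using namespace Eigen::TensorSycl::internal;
  PointerMapper pmap;                       // fake-pointer -> buffer map
  void* a = SYCLmalloc(256, pmap);          // first fake pointer (base address)
  void* b = SYCLmalloc(512, pmap);          // next fake pointer, 256 bytes further
  std::ptrdiff_t off = pmap.get_offset(static_cast<char*>(b) + 16);  // 16
  SYCLfree(a, pmap);                        // node goes to the free list for reuse
  SYCLfreeAll(pmap);                        // drop all remaining buffers
  return off == 16 ? 0 : 1;
}
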

View File

@ -531,12 +531,12 @@
#error "NVCC as the target platform for HIPCC is currently not supported."
#endif
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is either nvcc or clang with CUDA enabled
#define EIGEN_CUDACC __CUDACC__
#endif
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__)
// Means we are generating code for the device
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
@ -548,7 +548,7 @@
#define EIGEN_CUDA_SDK_VER 0
#endif
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
#define EIGEN_HIPCC __HIPCC__
@ -557,7 +557,7 @@
// ++ host_defines.h which contains the defines for the __host__ and __device__ macros
#include <hip/hip_runtime.h>
#if defined(__HIP_DEVICE_COMPILE__)
#if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
#endif
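
The guards added above keep Eigen's CUDA/HIP device paths disabled while a SYCL compiler runs its device pass, even when that compiler targets a CUDA or HIP backend and therefore also defines __CUDACC__/__CUDA_ARCH__ (or the HIP equivalents). A small illustration of the resulting dispatch, using a hypothetical macro rather than the Eigen ones:

// Hypothetical consumer of the same guard pattern.
#if defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
  // Plain CUDA device compilation: CUDA-specific code may be emitted.
  #define MY_BACKEND 1  // CUDA path
#elif defined(__SYCL_DEVICE_ONLY__)
  // SYCL device pass: even if __CUDA_ARCH__ is defined by a CUDA-backend
  // SYCL compiler, the SYCL path wins because of the extra guard.
  #define MY_BACKEND 2  // SYCL path
#else
  #define MY_BACKEND 0  // host path
#endif
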

View File

@ -2,17 +2,13 @@
// A Scalar with internal representation T+1 so that zero is internally
// represented by T(1). This is used to test memory fill.
//
#pragma once
template<typename T>
class OffByOneScalar {
public:
OffByOneScalar() : val_(1) {}
OffByOneScalar(const OffByOneScalar& other) {
*this = other;
}
OffByOneScalar& operator=(const OffByOneScalar& other) {
val_ = other.val_;
return *this;
}
OffByOneScalar(const OffByOneScalar& other) = default;
OffByOneScalar& operator=(const OffByOneScalar& other) = default;
OffByOneScalar(T val) : val_(val + 1) {}
OffByOneScalar& operator=(T val) {

View File

@ -128,12 +128,6 @@ struct TensorEvaluator<const TensorIndexPairOp<ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
};
@ -278,12 +272,6 @@ struct TensorEvaluator<const TensorPairReducerOp<ReduceOp, Dims, ArgType>, Devic
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_orig_impl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {

View File

@ -228,14 +228,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
block.cleanup();
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
private:

View File

@ -687,13 +687,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
Broadcast functor() const { return m_broadcast; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
cl::sycl::handler& cgh) const {
m_impl.bind(cgh);
}
#endif
private:
static constexpr bool IsColMajor =
static_cast<int>(Layout) == static_cast<int>(ColMajor);

View File

@ -355,12 +355,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
return NULL;
}
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const

View File

@ -279,14 +279,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;

View File

@ -61,13 +61,6 @@ struct CoeffLoader {
return m_tensor.template packet<LoadMode>(index);
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors need to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_tensor.bind(cgh);
}
#endif
private:
const Tensor m_tensor;
};
@ -97,12 +90,6 @@ struct CoeffLoader<Tensor, true, MakePointer_> {
return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors need to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
private:
typedef typename Tensor::Scalar Scalar;
@ -250,13 +237,6 @@ class SimpleTensorContractionMapper {
return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors need to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_tensor.bind(cgh);
}
#endif
const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
return m_tensor;
}
@ -508,13 +488,6 @@ class TensorContractionSubMapper {
return false;
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors need to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_base_mapper.bind(cgh);
}
#endif
const ParentMapper& base_mapper() const { return m_base_mapper; }
Index vert_offset() const { return m_vert_offset; }
Index horiz_offset() const { return m_horiz_offset; }
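
The bind() removals in this and the surrounding evaluator/mapper files all follow from the same USM change: the evaluators now hold raw device pointers rather than placeholder accessors, so there is nothing to register with the command group before launching a kernel. A minimal sketch of the difference (illustrative kernel, not an Eigen evaluator; assumes a SYCL 2020 queue and a USM device pointer):

#include <CL/sycl.hpp>

void scale(cl::sycl::queue& q, float* data, size_t n, float factor) {
  q.submit([&](cl::sycl::handler& cgh) {
    // Previously a placeholder accessor had to be bound here via
    // evaluator.bind(cgh) / cgh.require(acc). With USM the raw pointer is
    // simply captured by the kernel lambda.
    cgh.parallel_for(cl::sycl::range<1>(n),
                     [=](cl::sycl::id<1> i) { data[i] *= factor; });
  }).wait();
}
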

View File

@ -627,7 +627,7 @@ class TensorContractionKernel {
ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
(thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
: compute_panel<false>(itemID, thread_properties, out_ptr);
@ -1048,7 +1048,7 @@ struct GeneralVectorTensor {
is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
const StorageIndex contractGroupId =
is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
auto out_ptr = out_res + (IsFinal ? 0 : contractGroupId * nonContractDim);
const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
@ -1255,8 +1255,8 @@ struct GeneralScalarContraction {
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_res.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
auto out_ptr = out_res;
OutScalar * scratch_ptr = scratch.get_pointer();
StorageIndex globalid = itemID.get_global_id(0);
StorageIndex localid = itemID.get_local_id(0);
@ -1395,13 +1395,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
typedef internal::TensorContractionInputMapper<
LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer>
LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
right_nocontract_t, contract_t,
PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
rhs_inner_dim_reordered, Unaligned, MakePointer>
RhsMapper;
// initialize data mappers
@ -1505,8 +1505,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
PacketAccess, input_mapper_properties, true, ct>
ContractKernelName;
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim).wait();
} else {
typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
@ -1518,7 +1518,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
triple_dim);
triple_dim).wait();
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
auto op = Op();
@ -1531,8 +1531,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
cl::sycl::range<1>(localRange)),
StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK).wait();
device().deallocate_temp(temp_pointer);
}
}
@ -1566,28 +1565,28 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait();
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
EvaluatorPointerType, Op>
ReductionKernel;
device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
tmp_global_accessor, buffer,
cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
cl::sycl::range<1>(localRange)),
StorageIndex(1), Op(), nonContractDim, cNumGroups);
StorageIndex(1), Op(), nonContractDim, cNumGroups).wait();
device().deallocate_temp(temp_pointer);
} else {
typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
TensorMapper, StorageIndex, Properties, CFactor, false,
is_lhs_vec, true>
ContractKernelName;
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait();
}
}
#endif
@ -1616,19 +1615,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
thread_range, local_range, K);
thread_range, local_range, K).wait();
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
EvaluatorPointerType, StorageIndex, local_range>
GenericRKernel;
device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
tmp_global_accessor, buffer,
cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()).wait();
device().deallocate_temp(temp_pointer);
} else {
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
local_range, K);
local_range, K).wait();
}
}
#endif
@ -1642,12 +1640,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
this->m_result = NULL;
}
}
// The placeholder accessors must be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
this->m_leftImpl.bind(cgh);
this->m_rightImpl.bind(cgh);
this->m_result.bind(cgh);
}
};
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
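
Two recurring changes in the contraction hunks above: the output handle is now a plain pointer, so expressions like out_res + offset are ordinary pointer arithmetic (no get_pointer() call), and every kernel_launcher call is followed by .wait(), since the launchers now return the cl::sycl::event instead of synchronizing internally. A reduced sketch of the launch/wait/deallocate pattern, assuming a USM queue (names are illustrative):

#include <CL/sycl.hpp>

void copy_then_free(cl::sycl::queue& q, float* tmp, float* out, size_t n) {
  cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
    cgh.parallel_for(cl::sycl::range<1>(n),
                     [=](cl::sycl::id<1> i) { out[i] = tmp[i]; });
  });
  e.wait();                // mirrors the .wait() calls added above
  cl::sycl::free(tmp, q);  // analogous to device().deallocate_temp(temp_pointer)
}
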

View File

@ -442,12 +442,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;

View File

@ -61,8 +61,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
return (boolean_check[0] && boolean_check[1]);
}
void operator()(cl::sycl::nd_item<2> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
auto buffer_ptr = buffer_acc;
auto kernel_ptr = kernel_filter;
// the required row to be calculated for each plane in shared memory
const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
@ -128,8 +128,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
}
void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
auto buffer_ptr = buffer_acc;
auto kernel_ptr = kernel_filter;
// the required row to be calculated for each plane in shared memory
const auto num_input = cl::sycl::range<2>{
(cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
@ -216,8 +216,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
auto buffer_ptr = buffer_acc;
auto kernel_ptr = kernel_filter;
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
@ -411,7 +411,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])).wait();
break;
}
@ -442,7 +442,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}).wait();
break;
}
@ -482,7 +482,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP).wait();
break;
}
@ -519,13 +519,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
}
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_kernelImpl.bind(cgh);
m_inputImpl.bind(cgh);
m_buf.bind(cgh);
m_kernel.bind(cgh);
}
private:
// No assignment (copies are needed by the kernels)

View File

@ -151,13 +151,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_result.bind(cgh);
}
#endif
protected:
void evalTo(EvaluatorPointerType data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
@ -324,12 +317,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_result.bind(cgh);
}
#endif
protected:
void evalTo(EvaluatorPointerType data) {

View File

@ -31,7 +31,8 @@ struct SyclDeviceInfo {
.template get_info<cl::sycl::info::device::local_mem_type>()),
max_work_item_sizes(
queue.get_device()
.template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
.template get_info<
cl::sycl::info::device::max_work_item_sizes<3>>()),
max_mem_alloc_size(
queue.get_device()
.template get_info<
@ -69,7 +70,6 @@ struct SyclDeviceInfo {
} // end namespace internal
} // end namespace TensorSycl
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
// TensorFlow via the Eigen SYCL Backend.
@ -110,147 +110,52 @@ class QueueInterface {
explicit QueueInterface(
const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
unsigned num_threads = std::thread::hardware_concurrency())
: m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
: m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}},
m_thread_pool(num_threads),
m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog.build_with_kernel_type<DeviceOrSelector>();
auto f = [&](cl::sycl::handler &cgh) {
cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
[=]() {})
};
EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
}
m_device_info(m_queue) {}
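
The constructor above now builds the queue with the in_order property, which serializes submissions on the queue and removes the need for the buffer/accessor dependency tracking (and the optional program-class path) that the deleted code provided. A standalone sketch of the same construction, assuming a SYCL 2020 implementation where cl::sycl aliases sycl (the async handler body is illustrative):

#include <CL/sycl.hpp>
#include <exception>
#include <iostream>

int main() {
  auto async_handler = [](cl::sycl::exception_list exceptions) {
    for (std::exception_ptr e : exceptions) {
      try { std::rethrow_exception(e); }
      catch (const cl::sycl::exception& ex) { std::cerr << ex.what() << "\n"; }
    }
  };
  cl::sycl::queue q{cl::sycl::default_selector_v, async_handler,
                    cl::sycl::property_list{cl::sycl::property::queue::in_order()}};
  std::cout << q.get_device().get_info<cl::sycl::info::device::name>() << "\n";
  return 0;
}
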
template <typename DeviceOrSelector>
explicit QueueInterface(
const DeviceOrSelector &dev_or_sel,
unsigned num_threads = std::thread::hardware_concurrency())
: QueueInterface(dev_or_sel,
[this](cl::sycl::exception_list l) {
this->exception_caught_ = this->sycl_async_handler(l);
},
num_threads) {}
: QueueInterface(
dev_or_sel,
[this](cl::sycl::exception_list l) {
this->exception_caught_ = this->sycl_async_handler(l);
},
num_threads) {}
explicit QueueInterface(
const cl::sycl::queue& q, unsigned num_threads = std::thread::hardware_concurrency())
: m_queue(q),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
m_thread_pool(num_threads),
m_device_info(m_queue) {}
const cl::sycl::queue &q,
unsigned num_threads = std::thread::hardware_concurrency())
: m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {}
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif
/// Attach an existing buffer to the pointer map, Eigen will not reuse it
EIGEN_STRONG_INLINE void *attach_buffer(
cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return static_cast<void *>(pMapper.add_pointer(buf));
}
/// Detach previously attached buffer
EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfree<false>(p, pMapper);
}
/// Allocating device pointer. This pointer is actually an 8 bytes host
/// pointer used as key to access the sycl device buffer. The reason is that
/// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
/// expressions. So we create a key pointer to be used in Eigen expression
/// construction. When we convert the Eigen construction into the sycl
/// construction we use this pointer as a key in our buffer_map and we make
/// sure that we dedicate only one buffer only for this pointer. The device
/// pointer would be deleted by calling deallocate function.
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
if (align > 0) {
num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
}
return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES,
num_bytes, m_queue);
#else
return (void *)cl::sycl::malloc_device(num_bytes, m_queue);
#endif
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
}
EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
if (align > 0) {
num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
}
#endif
std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
if (scratch_buffers.empty()) {
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
;
} else {
for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
auto buff = pMapper.get_buffer(*it);
if (buff.get_size() >= num_bytes) {
auto ptr = *it;
scratch_buffers.erase(it);
return ptr;
} else {
++it;
}
}
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
}
#else
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, data_t>
get(data_t *data) const {
return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
data_t>
data) const {
return static_cast<data_t *>(data.get_virtual_pointer());
return (void *)cl::sycl::malloc_device<uint8_t>(num_bytes, m_queue);
}
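
allocate() and allocate_temp() above now go straight to USM device allocations, keeping only the EIGEN_MAX_ALIGN_BYTES size rounding from the old path. A standalone sketch of that logic, with a hard-coded 64-byte alignment standing in for EIGEN_MAX_ALIGN_BYTES:

#include <CL/sycl.hpp>
#include <cstddef>

void* device_allocate(cl::sycl::queue& q, std::size_t num_bytes) {
  constexpr std::size_t kAlign = 64;       // stand-in for EIGEN_MAX_ALIGN_BYTES
  const std::size_t rem = num_bytes % kAlign;
  if (rem > 0) num_bytes += kAlign - rem;  // round up to the alignment
  return cl::sycl::aligned_alloc_device(kAlign, num_bytes, q);
}

void device_deallocate(cl::sycl::queue& q, void* p) {
  cl::sycl::free(p, q);  // matches the new deallocate()/deallocate_temp()
}
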
EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.insert(p);
#else
TensorSycl::internal::SYCLfree(p, pMapper);
#endif
}
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE void deallocate_temp(
const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
deallocate_temp(p.get_virtual_pointer());
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
return data;
}
EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); }
EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const {
deallocate_temp(const_cast<void *>(p));
}
/// This is used to deallocate the device pointer. p is used as a key inside
/// the map to find the device buffer and delete it.
EIGEN_STRONG_INLINE void deallocate(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfree(p, pMapper);
}
EIGEN_STRONG_INLINE void deallocate_all() const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.clear();
#endif
cl::sycl::free(p, m_queue);
}
/// The memcpyHostToDevice is used to copy the data from host to device
@ -260,24 +165,7 @@ class QueueInterface {
EIGEN_STRONG_INLINE void memcpyHostToDevice(
void *dst, const void *src, size_t n,
std::function<void()> callback) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
static const auto global_access = cl::sycl::access::target::global_buffer;
typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
write_accessor;
if (n == 0) {
if (callback) callback();
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
auto non_deleter = [](buffer_scalar_t const *) {};
std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
cgh.copy(s_ptr, dst_acc);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
auto e = m_queue.memcpy(dst, src, n);
synchronize_and_callback(e, callback);
}
@ -288,24 +176,11 @@ class QueueInterface {
EIGEN_STRONG_INLINE void memcpyDeviceToHost(
void *dst, const void *src, size_t n,
std::function<void()> callback) const {
static const auto read_mode = cl::sycl::access::mode::read;
static const auto global_access = cl::sycl::access::target::global_buffer;
typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
read_accessor;
if (n == 0) {
if (callback) callback();
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
auto non_deleter = [](buffer_scalar_t *) {};
std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
cgh.copy(src_acc, s_ptr);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
auto e = m_queue.memcpy(dst, src, n);
synchronize_and_callback(e, callback);
}
@ -313,257 +188,88 @@ class QueueInterface {
/// No callback is required here as both arguments are on the device
/// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
static const auto read_mode = cl::sycl::access::mode::read;
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (n == 0) {
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
cgh.copy(src_acc, dst_acc);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
m_queue.memcpy(dst, src, n).wait();
}
/// the memset function.
/// No callback is required here as both arguments are on the device
/// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (n == 0) {
return;
}
auto f = [&](cl::sycl::handler &cgh) {
// Get a typed range accessor to ensure we fill each byte, in case
// `buffer_scalar_t` is not (u)int8_t.
auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
cgh.fill(dst_acc, static_cast<uint8_t>(c));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
m_queue.memset(data, c, n).wait();
}
template<typename T>
EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
template <typename T>
EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
if (begin == end) {
return;
}
const ptrdiff_t count = end - begin;
auto f = [&](cl::sycl::handler &cgh) {
auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
cgh.fill(dst_acc, value);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
}
/// Get a range accessor to the virtual pointer's device memory. This range
/// accessor will allow access to the memory from the pointer to the end of
/// the buffer.
///
/// NOTE: Inside a kernel the range accessor will always be indexed from the
/// start of the buffer, so the offset in the accessor is only used by
/// methods like handler::copy and will not be available inside a kernel.
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
get_range_accessor(const void *ptr) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto original_buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
const ptrdiff_t typed_offset = offset / sizeof(T);
eigen_assert(typed_offset >= 0);
const auto typed_size = original_buffer.get_size() / sizeof(T);
auto buffer = original_buffer.template reinterpret<
std::remove_const_t<T>>(
cl::sycl::range<1>(typed_size));
const ptrdiff_t size = buffer.get_count() - typed_offset;
eigen_assert(size >= 0);
typedef cl::sycl::accessor<std::remove_const_t<T>,
1, AcMd, global_access, is_place_holder>
placeholder_accessor_t;
const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
cl::sycl::id<1>(typed_offset)),
static_cast<size_t>(typed_offset),
reinterpret_cast<std::intptr_t>(start_ptr));
}
/// Get a range accessor to the virtual pointer's device memory with a
/// specified size.
template <cl::sycl::access::mode AcMd, typename Index>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
const Index n_bytes) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
eigen_assert(n_bytes >= 0);
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset >= 0);
eigen_assert(offset + n_bytes <= buffer.get_size());
return buffer.template get_access<AcMd, global_access>(
cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
}
/// Get a range accessor to the virtual pointer's device memory with a
/// specified type and count.
template <cl::sycl::access::mode AcMd, typename T, typename Index>
EIGEN_STRONG_INLINE cl::sycl::accessor<
T, 1, AcMd, cl::sycl::access::target::global_buffer>
get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
const Index count) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
eigen_assert(count >= 0);
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset >= 0);
// Technically we should create a subbuffer for the desired range,
// then reinterpret that. However, I was not able to get changes to reflect
// in the original buffer (only the subbuffer and reinterpreted buffer).
// This current implementation now has the restriction that the buffer
// offset and original buffer size must be a multiple of sizeof(T).
// Note that get_range_accessor(void*) currently has the same restriction.
//
// auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
// cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
const ptrdiff_t typed_offset = offset / sizeof(T);
const size_t typed_size = buffer.get_size() / sizeof(T);
auto reint = buffer.template reinterpret<
std::remove_const_t<T>>(
cl::sycl::range<1>(typed_size));
return reint.template get_access<AcMd, global_access>(
cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
}
/// Creation of a SYCL accessor for a buffer. This function first tries to
/// find the buffer in the buffer_map. If found, it gets the accessor from it;
/// if not, it adds an entry by creating a SYCL buffer for that particular
/// pointer.
template <cl::sycl::access::mode AcMd>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_buffer(ptr)
.template get_access<AcMd, cl::sycl::access::target::global_buffer>(
cgh);
}
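// A minimal usage sketch (illustrative only; `queue_interface` and `dev_ptr`
// are assumptions): the buffer backing `dev_ptr` is looked up (or created)
// under the pmapper mutex and a whole-buffer accessor is bound to `cgh`.
//
//   queue_interface.sycl_queue().submit([&](cl::sycl::handler &cgh) {
//     auto acc = queue_interface.template get_sycl_accessor<
//         cl::sycl::access::mode::read_write>(cgh, dev_ptr);
//     cgh.single_task([=]() { acc[0] = buffer_scalar_t(0); });
//   });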
EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_buffer(ptr);
}
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_offset(ptr);
const size_t count = end - begin;
m_queue.fill(begin, value, count).wait();
}
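// An illustrative sketch of the USM fill path used above (names such as `q`
// are assumptions, not part of this file): `begin` is a device pointer from
// cl::sycl::malloc_device, and queue::fill writes `count` copies of `value`.
//
//   float *begin = cl::sycl::malloc_device<float>(128, q);
//   q.fill(begin, 1.0f, std::size_t(128)).wait();
//   cl::sycl::free(begin, q);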
template <typename OutScalar, typename sycl_kernel, typename Lhs,
typename Rhs, typename OutPtr, typename Range, typename Index,
typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
const Rhs &rhs, OutPtr outptr,
Range thread_range,
Index scratchSize,
T... var) const {
EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(
const Lhs &lhs, const Rhs &rhs, OutPtr outptr, Range thread_range,
Index scratchSize, T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
lhs.bind(cgh);
rhs.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
cgh.parallel_for(thread_range,
sycl_kernel(scratch, lhs, rhs, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
return m_queue.submit(kernel_functor);
}
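// Caller-side sketch (kernel and argument names are illustrative): the
// launcher returns the submission event, so synchronization is the caller's
// choice, e.g.
//
//   dev.template binary_kernel_launcher<OutScalar, MyBinaryKernel>(
//          lhs, rhs, out_ptr, thread_range, scratch_size, extra_args...)
//       .wait();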
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename OutPtr, typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
OutPtr &outptr,
Range thread_range,
Index scratchSize,
T... var) const {
EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr,
OutPtr &outptr,
Range thread_range,
Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
inptr.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, inptr, outptr, var...));
cgh.parallel_for(thread_range,
sycl_kernel(scratch, inptr, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
return m_queue.submit(kernel_functor);
}
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
Range thread_range,
Index scratchSize,
T... var) const {
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(
const InPtr &inptr, Range thread_range, Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
inptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, inptr, var...));
cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
}
return m_queue.submit(kernel_functor);
}
EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
@ -573,14 +279,6 @@ class QueueInterface {
#endif
}
EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
synchronize();
#endif
}
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
Index &rng, Index &GRange) const {
@ -776,48 +474,9 @@ class QueueInterface {
return !exception_caught_;
}
EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
std::lock_guard<std::mutex> lock(event_mutex_);
return latest_events_[std::this_thread::get_id()];
#else
eigen_assert(false);
return cl::sycl::event();
#endif
}
// destructor
~QueueInterface() {
pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.clear();
#endif
}
template <typename T>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> get_null_accessor()
const {
eigen_assert(null_buff_simulator.get_size() % sizeof(T) == 0 && "The null buffer size must be a multiple of sizeof(T)");
const ptrdiff_t typed_size = null_buff_simulator.get_size() / sizeof(T);
eigen_assert(typed_size > 0);
auto typed_null_buff =
null_buff_simulator.template reinterpret<T>(cl::sycl::range<1>(typed_size));
return TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>(typed_null_buff);
}
protected:
EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
std::lock_guard<std::mutex> lock(event_mutex_);
latest_events_[std::this_thread::get_id()] = e;
#else
EIGEN_UNUSED_VARIABLE(e);
#endif
}
void synchronize_and_callback(cl::sycl::event e,
const std::function<void()> &callback) const {
set_latest_event(e);
if (callback) {
auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
@ -850,29 +509,8 @@ class QueueInterface {
/// class members:
bool exception_caught_ = false;
mutable std::mutex pmapper_mutex_;
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
mutable std::mutex event_mutex_;
mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif
/// std::map is the container used to make sure that we create only one buffer
/// per pointer. The lifespan of the buffer now depends on the lifespan of
/// SyclDevice. If a non-read-only pointer needs to be accessed on the host,
/// we should manually deallocate it.
mutable TensorSycl::internal::PointerMapper pMapper;
cl::sycl::buffer<uint8_t, 1> null_buff_simulator = cl::sycl::buffer<uint8_t, 1>(cl::sycl::range<1>(128));
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
mutable std::unordered_set<void *> scratch_buffers;
#endif
/// sycl queue
mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
mutable cl::sycl::program m_prog;
#endif
/// The thread pool is used to wait on events and call callbacks
/// asynchronously
mutable Eigen::ThreadPool m_thread_pool;
@ -896,32 +534,6 @@ struct SyclDeviceBase {
struct SyclDevice : public SyclDeviceBase {
explicit SyclDevice(const QueueInterface *queue_stream)
: SyclDeviceBase(queue_stream) {}
template <typename scalar_t>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, scalar_t>
get_null_accessor() const {
return queue_stream()->template get_null_accessor<scalar_t>();
}
// this is the accessor used to construct the evaluator
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
get_range_accessor(const void *ptr) const {
return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
}
// get sycl accessor
template <cl::sycl::access::mode AcMd>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
}
/// Accessing the created sycl device buffer for the device pointer
EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
const void *ptr) const {
return queue_stream()->get_sycl_buffer(ptr);
}
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
@ -966,40 +578,14 @@ struct SyclDevice : public SyclDeviceBase {
EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
queue_stream()->deallocate_temp(buffer);
}
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE void deallocate_temp(
const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const {
queue_stream()->deallocate_temp(buffer);
}
EIGEN_STRONG_INLINE void deallocate_all() const {
queue_stream()->deallocate_all();
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, data_t>
get(data_t *data) const {
return queue_stream()->get(data);
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
data_t>
data) const {
return queue_stream()->get(data);
}
/// attach existing buffer
EIGEN_STRONG_INLINE void *attach_buffer(
cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
return queue_stream()->attach_buffer(buf);
}
/// detach buffer
EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
queue_stream()->detach_buffer(p);
}
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
return queue_stream()->get_offset(ptr);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
return data;
}
// some runtime conditions that can be applied here
@ -1029,19 +615,14 @@ struct SyclDevice : public SyclDeviceBase {
queue_stream()->memset(data, c, n);
}
/// the fill function
template<typename T>
EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
template <typename T>
EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
queue_stream()->fill(begin, end, value);
}
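// Usage sketch (pointer and size names are illustrative, not part of this
// file): device.fill(gpu_data, gpu_data + n, DataType(7));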
/// returning the sycl queue
EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
return queue_stream()->sycl_queue();
}
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
EIGEN_STRONG_INLINE cl::sycl::program &program() const {
return queue_stream()->program();
}
#endif
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
@ -1081,13 +662,6 @@ struct SyclDevice : public SyclDeviceBase {
EIGEN_STRONG_INLINE void synchronize() const {
queue_stream()->synchronize();
}
EIGEN_STRONG_INLINE void async_synchronize(
cl::sycl::event e = cl::sycl::event()) const {
queue_stream()->async_synchronize(e);
}
EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
return queue_stream()->get_latest_event();
}
// This function checks if the runtime recorded an error for the
// underlying stream device.
@ -1109,20 +683,20 @@ struct SyclDevice : public SyclDeviceBase {
return queue_stream()->getDeviceVendor();
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
var...);
EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
return queue_stream()
->template binary_kernel_launcher<OutScalar, KernelType>(var...);
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
var...);
EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
return queue_stream()
->template unary_kernel_launcher<OutScalar, KernelType>(var...);
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
var...);
EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
return queue_stream()
->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
}
};
} // end namespace Eigen

View File

@ -217,13 +217,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
ArgType expression() const { return m_expression; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_buffer.bind(cgh);
}
#endif
private:

View File

@ -183,12 +183,6 @@ struct TensorEvaluator
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
protected:
EvaluatorPointerType m_data;
Dimensions m_dims;
@ -215,13 +209,7 @@ Eigen::half loadConstant(const Eigen::half* address) {
return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
}
#endif
#ifdef EIGEN_USE_SYCL
// overload of load constant should be implemented here based on range access
template <cl::sycl::access::mode AcMd, typename T>
T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
return *address;
}
#endif
} // namespace internal
// Default evaluator for rvalues
@ -338,12 +326,7 @@ struct TensorEvaluator<const Derived, Device>
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
protected:
EvaluatorPointerType m_data;
Dimensions m_dims;
@ -425,13 +408,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_argImpl.bind(cgh);
}
#endif
private:
const NullaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
@ -538,14 +514,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
m_argImpl.bind(cgh);
}
#endif
private:
const Device EIGEN_DEVICE_REF m_device;
const UnaryOp m_functor;
@ -678,13 +646,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
private:
const Device EIGEN_DEVICE_REF m_device;
const BinaryOp m_functor;
@ -793,15 +754,6 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_arg1Impl.bind(cgh);
m_arg2Impl.bind(cgh);
m_arg3Impl.bind(cgh);
}
#endif
private:
const TernaryOp m_functor;
TensorEvaluator<Arg1Type, Device> m_arg1Impl;

View File

@ -743,7 +743,7 @@ class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
evaluator,
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
cl::sycl::range<1>(tileSize)),
Index(1), range);
Index(1), range).wait();
}
evaluator.cleanup();
}

View File

@ -207,12 +207,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {

View File

@ -219,13 +219,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
EvaluatorPointerType data() const { return m_buffer; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_buffer.bind(cgh);
m_impl.bind(cgh);
}
#endif
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;

View File

@ -107,22 +107,6 @@ struct GpuDevice;
struct SyclDevice;
#ifdef EIGEN_USE_SYCL
template <typename T> struct MakeSYCLPointer {
typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
};
template <typename T>
EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
return data;
}
template <typename T>
struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
template <typename T>
struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
namespace TensorSycl {
namespace internal{
template <typename Evaluator, typename Op> class GenericNondeterministicReducer;

View File

@ -266,11 +266,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void extract_coordinates(Index index, array<Index, NumDims>& coords) const {

View File

@ -507,14 +507,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }

View File

@ -228,13 +228,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;

View File

@ -123,13 +123,6 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
}
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

View File

@ -221,12 +221,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
NewDimensions m_dimensions;
@ -655,12 +649,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
return NULL;
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
@ -1004,12 +992,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
return NULL;
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{

View File

@ -485,13 +485,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
private:
struct BlockIteratorState {
BlockIteratorState()

View File

@ -269,13 +269,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;

View File

@ -541,24 +541,6 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
template<typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;
namespace internal {
namespace reduction {
template <typename CoeffReturnType, typename Device>
EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
typename std::enable_if<Eigen::internal::is_same<Device, Eigen::SyclDevice>::value, const Device>::type& dev) {
return (dev.template get_null_accessor<CoeffReturnType>());
}
template <typename CoeffReturnType, typename Device>
EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
typename std::enable_if<!Eigen::internal::is_same<Device, Eigen::SyclDevice>::value, const Device>::type&) {
return NULL;
}
}// end namespace reduction
} // end namespace internal
// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
@ -623,7 +605,7 @@ static constexpr bool RunningOnGPU = false;
EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
: m_impl(op.expression(), device),
m_reducer(op.reducer()),
m_result(internal::reduction::get_null_value<CoeffReturnType, Device>(device)),
m_result(NULL),
m_device(device) {
EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@ -911,13 +893,6 @@ static constexpr bool RunningOnGPU = false;
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_result.bind(cgh);
}
#endif
private:
template <int, typename, typename> friend struct internal::GenericDimReducer;

View File

@ -96,8 +96,8 @@ struct SecondStepFullReducer {
// algorithm must be changed if the number of reductions per thread in the
// second step is greater than 1. Otherwise, the result will be wrong.
const Index localid = itemID.get_local_id(0);
auto aInPtr = aI.get_pointer() + localid;
auto aOutPtr = outAcc.get_pointer();
auto aInPtr = aI + localid;
auto aOutPtr = outAcc;
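// `aI` and `outAcc` are plain device (USM) pointers here, so they are
// indexed with ordinary pointer arithmetic.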
CoeffReturnType *scratchptr = scratch.get_pointer();
CoeffReturnType accumulator = *aInPtr;
@ -146,7 +146,7 @@ class FullReductionKernelFunctor {
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
auto output_ptr = final_output;
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
@ -185,7 +185,7 @@ class FullReductionKernelFunctor {
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
auto output_ptr = final_output;
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
// vectorizable parts
@ -232,7 +232,7 @@ class GenericNondeterministicReducer {
// This is to bypass the stateful condition in the Eigen meanReducer
Op non_const_functor;
std::memcpy(&non_const_functor, &functor, sizeof (Op));
auto output_accessor_ptr = output_accessor.get_pointer();
auto output_accessor_ptr = output_accessor;
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
CoeffReturnType accum = functor.initialize();
@ -313,9 +313,9 @@ struct PartialReductionKernel {
Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
auto scratchPtr = scratch.get_pointer().get();
CoeffReturnType* scratchPtr = scratch.get_pointer();
auto outPtr =
output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
CoeffReturnType accumulator = op.initialize();
element_wise_reduce(globalRId, globalPId, accumulator);
@ -387,7 +387,7 @@ struct SecondStepPartialReduction {
if (globalId >= num_coeffs_to_preserve) return;
auto in_ptr = input_accessor.get_pointer() + globalId;
auto in_ptr = input_accessor + globalId;
OutScalar accumulator = op.initialize();
// num_coeffs_to_reduce is not bigger than 256
@ -395,7 +395,7 @@ struct SecondStepPartialReduction {
op.reduce(*in_ptr, &accumulator);
in_ptr += num_coeffs_to_preserve;
}
output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
output_accessor[globalId] = op.finalize(accumulator);
}
}; // namespace internal
@ -453,21 +453,18 @@ struct PartialReducerLauncher {
EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
num_coeffs_to_reduce);
num_coeffs_to_reduce).wait();
typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
SecondStepPartialReductionKernel;
dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
temp_accessor, output,
cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
reducer, num_coeffs_to_preserve, rNumGroups);
reducer, num_coeffs_to_preserve, rNumGroups).wait();
self.device().deallocate_temp(temp_pointer);
} else {
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
num_coeffs_to_reduce);
num_coeffs_to_reduce).wait();
}
return false;
}
@ -512,20 +509,19 @@ struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
local_range, inputSize, reducer);
local_range, inputSize, reducer).wait();
typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
EvaluatorPointerType, Index, local_range>
GenericRKernel;
dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
tmp_global_accessor, data,
cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
reducer);
reducer).wait();
dev.deallocate_temp(temp_pointer);
} else {
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
reducer);
reducer).wait();
}
}
};
@ -574,7 +570,7 @@ struct GenericReducer<Self, Op, Eigen::SyclDevice> {
dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1)).wait();
return false;
}
};

View File

@ -368,13 +368,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;

View File

@ -507,13 +507,6 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
m_impl.cleanup();
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_output.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
const Device EIGEN_DEVICE_REF m_device;

View File

@ -89,8 +89,8 @@ struct ScanKernelFunctor {
LocalAccessor scratch;
Evaluator dev_eval;
OutAccessor out_accessor;
OutAccessor temp_accessor;
OutAccessor out_ptr;
OutAccessor tmp_ptr;
const ScanParameters<Index> scanParameters;
Op accumulator;
const bool inclusive;
@ -100,8 +100,8 @@ struct ScanKernelFunctor {
const bool inclusive_)
: scratch(scratch_),
dev_eval(dev_eval_),
out_accessor(out_accessor_),
temp_accessor(temp_accessor_),
out_ptr(out_accessor_),
tmp_ptr(temp_accessor_),
scanParameters(scanParameters_),
accumulator(accumulator_),
inclusive(inclusive_) {}
@ -131,9 +131,7 @@ struct ScanKernelFunctor {
first_step_inclusive_Operation(InclusiveOp) const {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_accessor.get_pointer();
auto tmp_ptr = temp_accessor.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
@ -190,7 +188,7 @@ struct ScanKernelFunctor {
}
private_offset *= 2;
}
scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
private_scan[PacketSize - 1 + packetIndex];
private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
// traverse down tree & build scan
@ -219,9 +217,9 @@ struct ScanKernelFunctor {
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(scratch_ptr[ai], &accum);
accumulator.reduce(scratch_ptr[bi], &accum);
scratch_ptr[bi] = accumulator.finalize(accum);
accumulator.reduce(scratch[ai], &accum);
accumulator.reduce(scratch[bi], &accum);
scratch[bi] = accumulator.finalize(accum);
}
offset *= 2;
}
@ -234,10 +232,10 @@ struct ScanKernelFunctor {
scanParameters.non_scan_size +
group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
block_id;
tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
}
// clear the last element
scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
}
// traverse down tree & build scan
for (Index d = 1; d < scratch_stride; d *= 2) {
@ -248,10 +246,10 @@ struct ScanKernelFunctor {
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(scratch_ptr[ai], &accum);
accumulator.reduce(scratch_ptr[bi], &accum);
scratch_ptr[ai] = scratch_ptr[bi];
scratch_ptr[bi] = accumulator.finalize(accum);
accumulator.reduce(scratch[ai], &accum);
accumulator.reduce(scratch[bi], &accum);
scratch[ai] = scratch[bi];
scratch[bi] = accumulator.finalize(accum);
}
}
// Synchronise
@ -262,7 +260,7 @@ struct ScanKernelFunctor {
EIGEN_UNROLL_LOOP
for (Index i = 0; i < PacketSize; i++) {
CoeffReturnType accum = private_scan[packetIndex + i];
accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
private_scan[packetIndex + i] = accumulator.finalize(accum);
}
}
@ -294,22 +292,20 @@ struct ScanAdjustmentKernelFunctor {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
InAccessor in_accessor;
OutAccessor out_accessor;
InAccessor in_ptr;
OutAccessor out_ptr;
const ScanParameters<Index> scanParameters;
Op accumulator;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
OutAccessor out_accessor_,
const ScanParameters<Index> scanParameters_,
Op accumulator_)
: in_accessor(in_accessor_),
out_accessor(out_accessor_),
: in_ptr(in_accessor_),
out_ptr(out_accessor_),
scanParameters(scanParameters_),
accumulator(accumulator_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto in_ptr = in_accessor.get_pointer();
auto out_ptr = out_accessor.get_pointer();
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
@ -426,7 +422,7 @@ struct SYCLAdjustBlockOffset {
AdjustFuctor;
dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
scan_info.max_elements_per_block,
scan_info.get_scan_parameter(), accumulator);
scan_info.get_scan_parameter(), accumulator).wait();
}
};
@ -449,7 +445,7 @@ struct ScanLauncher_impl {
typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
scan_info.get_scan_parameter(), accumulator, inclusive);
scan_info.get_scan_parameter(), accumulator, inclusive).wait();
if (scan_info.block_size > 1) {
ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(

View File

@ -290,12 +290,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
Index input_index,

View File

@ -223,12 +223,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{

View File

@ -256,13 +256,6 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
return result;
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
// Given the output index, finds the first index in the input tensor used to compute the trace
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {

View File

@ -535,12 +535,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{

View File

@ -148,9 +148,6 @@ if(EIGEN_TEST_SYCL)
if(EIGEN_SYCL_REG_N)
add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
endif()
if(EIGEN_SYCL_USE_PROGRAM_CLASS)
add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
endif()
if(EIGEN_SYCL_ASYNC_EXECUTION)
add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
endif()

View File

@ -23,15 +23,6 @@
#include <stdint.h>
#include <iostream>
#ifdef SYCL_COMPILER_IS_DPCPP
template <typename T>
struct cl::sycl::is_device_copyable<
OffByOneScalar<T>,
std::enable_if_t<!(!std::is_trivially_copyable<OffByOneScalar<T>>::value &&
(std::is_const_v<OffByOneScalar<T>> || std::is_volatile_v<OffByOneScalar<T>>))>>
: std::true_type {};
#endif
template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
IndexType sizeDim1 = 100;
@ -39,7 +30,7 @@ void test_device_memory(const Eigen::SyclDevice &sycl_device) {
Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
// memset
memset(in1.data(), 1, in1.size() * sizeof(DataType));
sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
@ -47,7 +38,7 @@ void test_device_memory(const Eigen::SyclDevice &sycl_device) {
for (IndexType i=0; i<in.size(); i++) {
VERIFY_IS_EQUAL(in(i), in1(i));
}
// fill
DataType value = DataType(7);
std::fill_n(in1.data(), in1.size(), value);
@ -77,31 +68,6 @@ void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_data);
}
template<typename DataType, int DataLayout, typename IndexType>
void test_device_attach_buffer(const Eigen::SyclDevice &sycl_device) {
IndexType sizeDim1 = 100;
array<IndexType, 1> tensorRange = {{sizeDim1}};
Tensor<DataType, 1, DataLayout, IndexType> in(tensorRange);
cl::sycl::buffer<buffer_scalar_t, 1> buffer(cl::sycl::range<1>(sizeDim1 * sizeof(DataType)));
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.attach_buffer(buffer));
// fill
DataType value = DataType(7);
std::fill_n(in.data(), in.size(), value);
sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);
// Check that buffer is filled with the correct value.
auto reint = buffer.reinterpret<DataType>(cl::sycl::range<1>(sizeDim1));
auto access = reint.template get_access<cl::sycl::access::mode::read>();
for (IndexType i=0; i<in.size(); i++) {
VERIFY_IS_EQUAL(in(i), access[i]);
}
sycl_device.detach_buffer(gpu_in_data);
}
template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
@ -112,7 +78,6 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
//test_device_exceptions<DataType, RowMajor>(sycl_device);
/// this test throws an exception. Enable it if you want to see the exception
//test_device_exceptions<DataType, ColMajor>(sycl_device);
test_device_attach_buffer<DataType, ColMajor, int64_t>(sycl_device);
}
EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {