diff --git a/Eigen/Core b/Eigen/Core index bf0b9c736..3ffe0a612 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -249,7 +249,6 @@ using std::ptrdiff_t; #endif #if defined(EIGEN_USE_SYCL) - #include "src/Core/arch/SYCL/SyclMemoryModel.h" #include "src/Core/arch/SYCL/InteropHeaders.h" #if !defined(EIGEN_DONT_VECTORIZE_SYCL) #include "src/Core/arch/SYCL/PacketMath.h" diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h index 062d50efb..57495a1ae 100644 --- a/Eigen/src/Core/arch/SYCL/PacketMath.h +++ b/Eigen/src/Core/arch/SYCL/PacketMath.h @@ -33,13 +33,9 @@ namespace internal { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \ pload##AlignedType( \ const typename unpacket_traits::type* from) { \ - using scalar = typename unpacket_traits::type; \ - typedef cl::sycl::multi_ptr< \ - const scalar, cl::sycl::access::address_space::generic_space, \ - cl::sycl::access::decorated::no> \ - multi_ptr; \ + auto ptr = cl::sycl::address_space_cast(from);\ packet_type res{}; \ - res.load(0, multi_ptr(from)); \ + res.load(0, ptr); \ return res; \ } @@ -54,11 +50,8 @@ SYCL_PLOAD(cl::sycl::cl_double2, ) template <> \ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ scalar* to, const packet_type& from) { \ - typedef cl::sycl::multi_ptr< \ - scalar, cl::sycl::access::address_space::generic_space, \ - cl::sycl::access::decorated::no> \ - multi_ptr; \ - from.store(0, multi_ptr(to)); \ + auto ptr = cl::sycl::address_space_cast(to);\ + from.store(0, ptr); \ } SYCL_PSTORE(float, cl::sycl::cl_float4, ) @@ -370,60 +363,6 @@ inline cl::sycl::cl_double2 pblend( } #endif // SYCL_DEVICE_ONLY -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type -ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, T>& from) { - return ploadt_ro(from.get_pointer()); -} - -#define SYCL_PLOAD(Alignment, AlignedType) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ - const Eigen::TensorSycl::internal::RangeAccess< \ - cl::sycl::access::mode::read_write, \ - typename unpacket_traits::type> \ - from) { \ - return ploadt_ro(from); \ - } -SYCL_PLOAD(Unaligned, u) -SYCL_PLOAD(Aligned, ) -#undef SYCL_PLOAD - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type -ploadt(const Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, - typename unpacket_traits::type>& from) { - return ploadt(from.get_pointer()); -} - -#define SYCL_PSTORE(alignment) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ - const Eigen::TensorSycl::internal::RangeAccess< \ - cl::sycl::access::mode::read_write, \ - typename unpacket_traits::type>& to, \ - const packet_type& from) { \ - pstore##alignment(to.get_pointer(), from); \ - } - -// global space -SYCL_PSTORE() -SYCL_PSTORE(u) - -#undef SYCL_PSTORE - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( - Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, - typename unpacket_traits::type> - to, - const packet_type& from) { - pstoret(to.get_pointer(), from); -} - } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h deleted file mode 100644 index c532c1867..000000000 --- a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +++ /dev/null @@ -1,700 +0,0 @@ -/*************************************************************************** - * Copyright (C) 2017 Codeplay Software Limited - * This Source 
Code Form is subject to the terms of the Mozilla - * Public License v. 2.0. If a copy of the MPL was not distributed - * with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - * - * SyclMemoryModel.h - * - * Description: - * Interface for SYCL buffers to behave as a non-dereferenceable pointer - * Interface for Placeholder accessor to behave as a pointer on both host - * and device - * - * Authors: - * - * Ruyman Reyes Codeplay Software Ltd. - * Mehdi Goli Codeplay Software Ltd. - * Vanya Yaneva Codeplay Software Ltd. - * - **************************************************************************/ - -#if defined(EIGEN_USE_SYCL) && \ - !defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H) -#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H - -#include -#ifdef EIGEN_EXCEPTIONS -#include -#endif -#include -#include -#include -#include - -#include "../../InternalHeaderCheck.h" - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -using sycl_acc_target = cl::sycl::access::target; -using sycl_acc_mode = cl::sycl::access::mode; - -/** - * Default values for template arguments - */ -using buffer_data_type_t = uint8_t; -const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer; -const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write; - -/** - * PointerMapper - * Associates fake pointers with buffers. - * - */ -class PointerMapper { - public: - using base_ptr_t = std::intptr_t; - - /* Structure of a virtual pointer - * - * |================================================| - * | POINTER ADDRESS | - * |================================================| - */ - struct virtual_pointer_t { - /* Type for the pointers - */ - base_ptr_t m_contents; - - /** Conversions from virtual_pointer_t to - * void * should just reinterpret_cast the integer number - */ - operator void *() const { return reinterpret_cast(m_contents); } - - /** - * Convert back to the integer number. - */ - operator base_ptr_t() const { return m_contents; } - - /** - * Add a certain value to the pointer to create a - * new pointer to that offset - */ - virtual_pointer_t operator+(size_t off) { return m_contents + off; } - - /* Numerical order for sorting pointers in containers. */ - bool operator<(virtual_pointer_t rhs) const { - return (static_cast(m_contents) < - static_cast(rhs.m_contents)); - } - - bool operator>(virtual_pointer_t rhs) const { - return (static_cast(m_contents) > - static_cast(rhs.m_contents)); - } - - /** - * Numerical order for sorting pointers in containers - */ - bool operator==(virtual_pointer_t rhs) const { - return (static_cast(m_contents) == - static_cast(rhs.m_contents)); - } - - /** - * Simple forward to the equality overload. - */ - bool operator!=(virtual_pointer_t rhs) const { - return !(this->operator==(rhs)); - } - - /** - * Converts a void * into a virtual pointer structure. - * Note that this will only work if the void * was - * already a virtual_pointer_t, but we have no way of - * checking - */ - virtual_pointer_t(const void *ptr) - : m_contents(reinterpret_cast(ptr)){}; - - /** - * Creates a virtual_pointer_t from the given integer - * number - */ - virtual_pointer_t(base_ptr_t u) : m_contents(u){}; - }; - - /* Definition of a null pointer - */ - const virtual_pointer_t null_virtual_ptr = nullptr; - - /** - * Whether if a pointer is null or not. 
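For reference, the PacketMath.h hunk above replaces the hand-built generic-space multi_ptr with SYCL 2020's address_space_cast. A minimal sketch of the same load/store pattern, assuming a SYCL 2020 compiler (names here are illustrative, not Eigen's):

    #include <sycl/sycl.hpp>

    // Device-side helpers: address_space_cast builds a multi_ptr directly from
    // a raw pointer, which sycl::vec::load/store accept.
    template <typename Packet, typename Scalar>
    inline Packet packet_load(const Scalar* from) {
      auto ptr = sycl::address_space_cast<
          sycl::access::address_space::generic_space,
          sycl::access::decorated::no>(from);
      Packet res{};
      res.load(0, ptr);  // offset is counted in whole packets
      return res;
    }

    template <typename Packet, typename Scalar>
    inline void packet_store(Scalar* to, const Packet& from) {
      auto ptr = sycl::address_space_cast<
          sycl::access::address_space::generic_space,
          sycl::access::decorated::no>(to);
      from.store(0, ptr);
    }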
- * A pointer is nullptr if the value is of null_virtual_ptr - */ - static inline bool is_nullptr(virtual_pointer_t ptr) { - return (static_cast(ptr) == nullptr); - } - - /* basic type for all buffers - */ - using buffer_t = cl::sycl::buffer; - - /** - * Node that stores information about a device allocation. - * Nodes are sorted by size to organise a free list of nodes - * that can be recovered. - */ - struct pMapNode_t { - buffer_t m_buffer; - size_t m_size; - bool m_free; - - pMapNode_t(buffer_t b, size_t size, bool f) - : m_buffer{b}, m_size{size}, m_free{f} { - m_buffer.set_final_data(nullptr); - } - - bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); } - }; - - /** Storage of the pointer / buffer tree - */ - using pointerMap_t = std::map; - - /** - * Obtain the insertion point in the pointer map for - * a pointer of the given size. - * \param requiredSize Size attempted to reclaim - */ - typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) { - typename pointerMap_t::iterator retVal; - bool reuse = false; - if (!m_freeList.empty()) { - // try to re-use an existing block - for (auto freeElem : m_freeList) { - if (freeElem->second.m_size >= requiredSize) { - retVal = freeElem; - reuse = true; - // Element is not going to be free anymore - m_freeList.erase(freeElem); - break; - } - } - } - if (!reuse) { - retVal = std::prev(m_pointerMap.end()); - } - return retVal; - } - - /** - * Returns an iterator to the node that stores the information - * of the given virtual pointer from the given pointer map structure. - * If pointer is not found, throws std::out_of_range. - * If the pointer map structure is empty, throws std::out_of_range - * - * \param pMap the pointerMap_t structure storing all the pointers - * \param virtual_pointer_ptr The virtual pointer to obtain the node of - * \throws std::out:of_range if the pointer is not found or pMap is empty - */ - typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) { - if (this->count() == 0) { - m_pointerMap.clear(); - EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n")); - - } - if (is_nullptr(ptr)) { - m_pointerMap.clear(); - EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n")); - } - // The previous element to the lower bound is the node that - // holds this memory address - auto node = m_pointerMap.lower_bound(ptr); - // If the value of the pointer is not the one of the node - // then we return the previous one - if (node == std::end(m_pointerMap)) { - --node; - } else if (node->first != ptr) { - if (node == std::begin(m_pointerMap)) { - m_pointerMap.clear(); - EIGEN_THROW_X( - std::out_of_range("The pointer is not registered in the map\n")); - } - --node; - } - - return node; - } - - /* get_buffer. 
- * Returns a buffer from the map using the pointer address - */ - template - cl::sycl::buffer get_buffer( - const virtual_pointer_t ptr) { - - auto node = get_node(ptr); - auto& map_node = node->second; - eigen_assert(node->first == ptr || node->first < ptr); - eigen_assert(ptr < static_cast(map_node.m_size + - node->first)); - return map_node.m_buffer.reinterpret( - cl::sycl::range<1>{map_node.m_size / sizeof(buffer_data_type)}); - } - - /** - * @brief Returns an accessor to the buffer of the given virtual pointer - * @param accessMode - * @param accessTarget - * @param ptr The virtual pointer - */ - template - cl::sycl::accessor - get_access(const virtual_pointer_t ptr) { - auto buf = get_buffer(ptr); - return buf.template get_access(); - } - - /** - * @brief Returns an accessor to the buffer of the given virtual pointer - * in the given command group scope - * @param accessMode - * @param accessTarget - * @param ptr The virtual pointer - * @param cgh Reference to the command group scope - */ - template - cl::sycl::accessor - get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) { - auto buf = get_buffer(ptr); - return buf.template get_access(cgh); - } - - /* - * Returns the offset from the base address of this pointer. - */ - inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) { - // The previous element to the lower bound is the node that - // holds this memory address - auto node = get_node(ptr); - auto start = node->first; - eigen_assert(start == ptr || start < ptr); - eigen_assert(ptr < start + node->second.m_size); - return (ptr - start); - } - - /* - * Returns the number of elements by which the given pointer is offset from - * the base address. - */ - template - inline size_t get_element_offset(const virtual_pointer_t ptr) { - return get_offset(ptr) / sizeof(buffer_data_type); - } - - /** - * Constructs the PointerMapper structure. - */ - PointerMapper(base_ptr_t baseAddress = 4096) - : m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} { - if (m_baseAddress == 0) { - EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n")); - } - }; - - /** - * PointerMapper cannot be copied or moved - */ - PointerMapper(const PointerMapper &) = delete; - - /** - * Empty the pointer list - */ - inline void clear() { - m_freeList.clear(); - m_pointerMap.clear(); - } - - /* add_pointer. - * Adds an existing pointer to the map and returns the virtual pointer id. - */ - inline virtual_pointer_t add_pointer(const buffer_t &b) { - return add_pointer_impl(b); - } - - /* add_pointer. - * Adds a pointer to the map and returns the virtual pointer id. 
- */ - inline virtual_pointer_t add_pointer(buffer_t &&b) { - return add_pointer_impl(b); - } - - /** - * @brief Fuses the given node with the previous nodes in the - * pointer map if they are free - * - * @param node A reference to the free node to be fused - */ - void fuse_forward(typename pointerMap_t::iterator &node) { - while (node != std::prev(m_pointerMap.end())) { - // if following node is free - // remove it and extend the current node with its size - auto fwd_node = std::next(node); - if (!fwd_node->second.m_free) { - break; - } - auto fwd_size = fwd_node->second.m_size; - m_freeList.erase(fwd_node); - m_pointerMap.erase(fwd_node); - - node->second.m_size += fwd_size; - } - } - - /** - * @brief Fuses the given node with the following nodes in the - * pointer map if they are free - * - * @param node A reference to the free node to be fused - */ - void fuse_backward(typename pointerMap_t::iterator &node) { - while (node != m_pointerMap.begin()) { - // if previous node is free, extend it - // with the size of the current one - auto prev_node = std::prev(node); - if (!prev_node->second.m_free) { - break; - } - prev_node->second.m_size += node->second.m_size; - - // remove the current node - m_freeList.erase(node); - m_pointerMap.erase(node); - - // point to the previous node - node = prev_node; - } - } - - /* remove_pointer. - * Removes the given pointer from the map. - * The pointer is allowed to be reused only if ReUse if true. - */ - template - void remove_pointer(const virtual_pointer_t ptr) { - if (is_nullptr(ptr)) { - return; - } - auto node = this->get_node(ptr); - - node->second.m_free = true; - m_freeList.emplace(node); - - // Fuse the node - // with free nodes before and after it - fuse_forward(node); - fuse_backward(node); - - // If after fusing the node is the last one - // simply remove it (since it is free) - if (node == std::prev(m_pointerMap.end())) { - m_freeList.erase(node); - m_pointerMap.erase(node); - } - } - - /* count. - * Return the number of active pointers (i.e, pointers that - * have been malloc but not freed). - */ - size_t count() const { return (m_pointerMap.size() - m_freeList.size()); } - - private: - /* add_pointer_impl. - * Adds a pointer to the map and returns the virtual pointer id. - * BufferT is either a const buffer_t& or a buffer_t&&. 
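The file being deleted here implemented the pre-USM workaround: SYCL buffers are not addressable, so SYCLmalloc handed out integer "virtual" pointers and PointerMapper resolved them back to a buffer plus offset. A toy illustration of that bookkeeping (greatly simplified, hypothetical names):

    #include <cstddef>
    #include <cstdint>
    #include <map>

    // Toy registry: fake integer addresses -> allocation size, standing in for
    // the deleted PointerMapper's map from virtual pointers to sycl::buffers.
    class ToyPointerMapper {
      std::map<std::intptr_t, std::size_t> allocs_;  // base address -> size
      std::intptr_t next_ = 4096;                    // nonzero base, as in Eigen

     public:
      void* add(std::size_t size) {
        void* key = reinterpret_cast<void*>(next_);
        allocs_.emplace(next_, size);
        next_ += static_cast<std::intptr_t>(size);
        return key;  // never dereferenced; only used as a lookup key
      }
      // Resolve an interior pointer to its offset, as get_node/get_offset did.
      std::ptrdiff_t offset_of(const void* p) const {
        auto addr = reinterpret_cast<std::intptr_t>(p);
        auto it = std::prev(allocs_.upper_bound(addr));
        return addr - it->first;
      }
    };

With USM device pointers, this entire layer (including the free-list reuse and node fusion below) becomes unnecessary.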
- */ - template - virtual_pointer_t add_pointer_impl(BufferT b) { - virtual_pointer_t retVal = nullptr; - size_t bufSize = b.get_count() * sizeof(buffer_data_type_t); - auto byte_buffer = - b.template reinterpret(cl::sycl::range<1>{bufSize}); - pMapNode_t p{byte_buffer, bufSize, false}; - - // If this is the first pointer: - if (m_pointerMap.empty()) { - virtual_pointer_t initialVal{m_baseAddress}; - m_pointerMap.emplace(initialVal, p); - return initialVal; - } - - auto lastElemIter = get_insertion_point(bufSize); - // We are recovering an existing free node - if (lastElemIter->second.m_free) { - lastElemIter->second.m_buffer = b; - lastElemIter->second.m_free = false; - - // If the recovered node is bigger than the inserted one - // add a new free node with the remaining space - if (lastElemIter->second.m_size > bufSize) { - // create a new node with the remaining space - auto remainingSize = lastElemIter->second.m_size - bufSize; - pMapNode_t p2{b, remainingSize, true}; - - // update size of the current node - lastElemIter->second.m_size = bufSize; - - // add the new free node - auto newFreePtr = lastElemIter->first + bufSize; - auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first; - m_freeList.emplace(freeNode); - } - - retVal = lastElemIter->first; - } else { - size_t lastSize = lastElemIter->second.m_size; - retVal = lastElemIter->first + lastSize; - m_pointerMap.emplace(retVal, p); - } - return retVal; - } - - /** - * Compare two iterators to pointer map entries according to - * the size of the allocation on the device. - */ - struct SortBySize { - bool operator()(typename pointerMap_t::iterator a, - typename pointerMap_t::iterator b) const { - return ((a->first < b->first) && (a->second <= b->second)) || - ((a->first < b->first) && (b->second <= a->second)); - } - }; - - /* Maps the pointer addresses to buffer and size pairs. - */ - pointerMap_t m_pointerMap; - - /* List of free nodes available for re-using - */ - std::set m_freeList; - - /* Base address used when issuing the first virtual pointer, allows users - * to specify alignment. Cannot be zero. */ - std::intptr_t m_baseAddress; -}; - -/* remove_pointer. - * Removes the given pointer from the map. - * The pointer is allowed to be reused only if ReUse if true. - */ -template <> -inline void PointerMapper::remove_pointer(const virtual_pointer_t ptr) { - if (is_nullptr(ptr)) { - return; - } - m_pointerMap.erase(this->get_node(ptr)); -} - -/** - * Malloc-like interface to the pointer-mapper. - * Given a size, creates a byte-typed buffer and returns a - * fake pointer to keep track of it. - * \param size Size in bytes of the desired allocation - * \throw cl::sycl::exception if error while creating the buffer - */ -inline void *SYCLmalloc(size_t size, PointerMapper &pMap) { - if (size == 0) { - return nullptr; - } - // Create a generic buffer of the given size - using buffer_t = cl::sycl::buffer; - auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size})); - // Store the buffer on the global list - return static_cast(thePointer); -} - -/** - * Free-like interface to the pointer mapper. - * Given a fake-pointer created with the virtual-pointer malloc, - * destroys the buffer and remove it from the list. - * If ReUse is false, the pointer is not added to the freeList, - * it should be false only for sub-buffers. - */ -template -inline void SYCLfree(void *ptr, PointerMapper &pMap) { - pMap.template remove_pointer(ptr); -} - -/** - * Clear all the memory allocated by SYCL. 
- */ -template -inline void SYCLfreeAll(PointerMapper &pMap) { - pMap.clear(); -} - -template -struct RangeAccess { - static const auto global_access = cl::sycl::access::target::global_buffer; - static const auto is_place_holder = cl::sycl::access::placeholder::true_t; - typedef T scalar_t; - typedef scalar_t &ref_t; - typedef scalar_t *ptr_t; - - // the accessor type does not necessarily the same as T - typedef cl::sycl::accessor - accessor; - - typedef RangeAccess self_t; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access, - size_t offset, - std::intptr_t virtual_ptr) - : access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {} - - RangeAccess(cl::sycl::buffer buff = - cl::sycl::buffer(cl::sycl::range<1>(1))) - : access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {} - - // This should be only used for null constructor on the host side - RangeAccess(std::nullptr_t) : RangeAccess() {} - // This template parameter must be removed and scalar_t should be replaced - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const { - typedef cl::sycl::multi_ptr - multi_ptr; - multi_ptr p(access_); - return (p + offset_).get_raw(); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) { - offset_ += (offset); - return *this; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const { - return self_t(access_, offset_ + offset, virtual_ptr_); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const { - return self_t(access_, offset_ - offset, virtual_ptr_); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) { - offset_ -= offset; - return *this; - } - - // THIS IS FOR NULL COMPARISON ONLY - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==( - const RangeAccess &lhs, std::nullptr_t) { - return ((lhs.virtual_ptr_ == -1)); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=( - const RangeAccess &lhs, std::nullptr_t i) { - return !(lhs == i); - } - - // THIS IS FOR NULL COMPARISON ONLY - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==( - std::nullptr_t, const RangeAccess &rhs) { - return ((rhs.virtual_ptr_ == -1)); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=( - std::nullptr_t i, const RangeAccess &rhs) { - return !(i == rhs); - } - // Prefix operator (Increment and return value) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() { - offset_++; - return (*this); - } - - // Postfix operator (Return value and increment) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) { - EIGEN_UNUSED_VARIABLE(i); - self_t temp_iterator(*this); - offset_++; - return temp_iterator; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const { - return (access_.get_count() - offset_); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const { - return offset_; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) { - offset_ = offset; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const { - return *get_pointer(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() { - return *get_pointer(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) { - return *(get_pointer() + x); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const { - return *(get_pointer() + x); - } - 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const { - return reinterpret_cast(virtual_ptr_ + - (offset_ * sizeof(scalar_t))); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const { - return (virtual_ptr_ != -1); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess() { - return RangeAccess(access_, offset_, virtual_ptr_); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - operator RangeAccess() const { - return RangeAccess(access_, offset_, virtual_ptr_); - } - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( - cl::sycl::handler &cgh) const { - cgh.require(access_); - } - - private: - accessor access_; - size_t offset_; - std::intptr_t virtual_ptr_; // the location of the buffer in the map -}; - -template -struct RangeAccess : RangeAccess { - typedef RangeAccess Base; - using Base::Base; -}; - -} // namespace internal -} // namespace TensorSycl -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e47303f58..cc5e22d74 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -531,12 +531,12 @@ #error "NVCC as the target platform for HIPCC is currently not supported." #endif -#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__) // Means the compiler is either nvcc or clang with CUDA enabled #define EIGEN_CUDACC __CUDACC__ #endif -#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) +#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__) // Means we are generating code for the device #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif @@ -548,7 +548,7 @@ #define EIGEN_CUDA_SDK_VER 0 #endif -#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) +#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__) // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP) #define EIGEN_HIPCC __HIPCC__ @@ -557,7 +557,7 @@ // ++ host_defines.h which contains the defines for the __host__ and __device__ macros #include - #if defined(__HIP_DEVICE_COMPILE__) + #if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__) // analogous to EIGEN_CUDA_ARCH, but for HIP #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ #endif diff --git a/test/OffByOneScalar.h b/test/OffByOneScalar.h index c0371a6c7..db59ab603 100644 --- a/test/OffByOneScalar.h +++ b/test/OffByOneScalar.h @@ -2,17 +2,13 @@ // A Scalar with internal representation T+1 so that zero is internally // represented by T(1). This is used to test memory fill. 
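On the Macros.h side, the added !defined(__SYCL_DEVICE_ONLY__) guards keep a SYCL device-compilation pass from being misidentified as a CUDA or HIP device pass when the same compiler front end defines both sets of macros. Schematically:

    // During a SYCL device pass, __SYCL_DEVICE_ONLY__ is defined, so
    // EIGEN_CUDACC / EIGEN_CUDA_ARCH / EIGEN_HIPCC stay undefined even if the
    // driver also defines __CUDACC__ or __HIPCC__ (e.g. a CUDA-capable clang):
    // CUDA/HIP-specific code paths are compiled out of the SYCL device image.
    #if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) && \
        !defined(__SYCL_DEVICE_ONLY__)
    #define EIGEN_CUDACC __CUDACC__
    #endif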
// +#pragma once template class OffByOneScalar { public: OffByOneScalar() : val_(1) {} - OffByOneScalar(const OffByOneScalar& other) { - *this = other; - } - OffByOneScalar& operator=(const OffByOneScalar& other) { - val_ = other.val_; - return *this; - } + OffByOneScalar(const OffByOneScalar& other) = default; + OffByOneScalar& operator=(const OffByOneScalar& other) = default; OffByOneScalar(T val) : val_(val + 1) {} OffByOneScalar& operator=(T val) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 0d6032710..1b642312a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -128,12 +128,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } -#ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - protected: TensorEvaluator m_impl; }; @@ -278,12 +272,6 @@ struct TensorEvaluator, Devic } EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } -#ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - m_orig_impl.bind(cgh); - } -#endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 3365b72dd..d2f6ee258 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -228,14 +228,6 @@ struct TensorEvaluator, Device> block.cleanup(); } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_leftImpl.bind(cgh); - m_rightImpl.bind(cgh); - } -#endif - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 64c39ca28..5ae0d8172 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -687,13 +687,6 @@ struct TensorEvaluator, Device> const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( - cl::sycl::handler& cgh) const { - m_impl.bind(cgh); - } -#endif private: static constexpr bool IsColMajor = static_cast(Layout) == static_cast(ColMajor); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index e5f3d5eea..dc3fc1847 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -355,12 +355,6 @@ struct TensorEvaluator, Device> return NULL; } } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index dea41dfd2..2be7692a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -279,14 +279,6 @@ struct TensorEvaluator m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 92cbaf6ff..8478cc3c1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -61,13 +61,6 @@ struct CoeffLoader { return m_tensor.template packet(index); } - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_tensor.bind(cgh); - } - #endif - private: const Tensor m_tensor; }; @@ -97,12 +90,6 @@ struct CoeffLoader { return internal::ploadt_ro(m_data + index); } - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } - #endif private: typedef typename Tensor::Scalar Scalar; @@ -250,13 +237,6 @@ class SimpleTensorContractionMapper { return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? m_contract_strides[0] : 1; } - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_tensor.bind(cgh); - } - #endif - const CoeffLoader& tensor() const { return m_tensor; } @@ -508,13 +488,6 @@ class TensorContractionSubMapper { return false; } - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_base_mapper.bind(cgh); - } - #endif - const ParentMapper& base_mapper() const { return m_base_mapper; } Index vert_offset() const { return m_vert_offset; } Index horiz_offset() const { return m_horiz_offset; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index 526fc816c..a4e0fc702 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -627,7 +627,7 @@ class TensorContractionKernel { ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); - auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); + auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); (thread_properties.is_internal) ? compute_panel(itemID, thread_properties, out_ptr) : compute_panel(itemID, thread_properties, out_ptr); @@ -1048,7 +1048,7 @@ struct GeneralVectorTensor { is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; const StorageIndex contractGroupId = is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; - auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim); + auto out_ptr = out_res + (IsFinal ? 
0 : contractGroupId * nonContractDim); const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; @@ -1255,8 +1255,8 @@ struct GeneralScalarContraction { EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const { - auto out_ptr = out_res.get_pointer(); - auto scratch_ptr = scratch.get_pointer().get(); + auto out_ptr = out_res; + OutScalar * scratch_ptr = scratch.get_pointer(); StorageIndex globalid = itemID.get_global_id(0); StorageIndex localid = itemID.get_local_id(0); @@ -1395,13 +1395,13 @@ struct TensorEvaluatorm_i_size, this->m_j_size, this->m_k_size}; typedef internal::TensorContractionInputMapper< LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, - PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer> LhsMapper; typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> + rhs_inner_dim_reordered, Unaligned, MakePointer> RhsMapper; // initialize data mappers @@ -1505,8 +1505,8 @@ struct TensorEvaluator ContractKernelName; - device().template binary_kernel_launcher( - lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); + device().template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim).wait(); } else { typedef TensorSycl::internal::TensorContractionKernel( lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, - triple_dim); + triple_dim).wait(); typedef Eigen::internal::SumReducer Op; auto op = Op(); @@ -1531,8 +1531,7 @@ struct TensorEvaluator(cl::sycl::range<1>(StorageIndex( Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), cl::sycl::range<1>(localRange)), - StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); - + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK).wait(); device().deallocate_temp(temp_pointer); } } @@ -1566,28 +1565,28 @@ struct TensorEvaluator(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - device().template binary_kernel_launcher( - vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C); + device().template binary_kernel_launcher( + vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait(); typedef Eigen::internal::SumReducer Op; typedef TensorSycl::internal::SecondStepPartialReduction ReductionKernel; - device().template unary_kernel_launcher( + device().template unary_kernel_launcher( tmp_global_accessor, buffer, cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), cl::sycl::range<1>(localRange)), - StorageIndex(1), Op(), nonContractDim, cNumGroups); - + StorageIndex(1), Op(), nonContractDim, cNumGroups).wait(); device().deallocate_temp(temp_pointer); } else { typedef Eigen::TensorSycl::internal::GeneralVectorTensor ContractKernelName; - device().template binary_kernel_launcher( - vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); + device().template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, 
nCNumGroups, nonContractDim, C).wait(); + } } #endif @@ -1616,19 +1615,18 @@ struct TensorEvaluator(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); device().template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, - thread_range, local_range, K); + thread_range, local_range, K).wait(); typedef Eigen::internal::SumReducer Op; typedef TensorSycl::internal::SecondStepFullReducer GenericRKernel; device().template unary_kernel_launcher( tmp_global_accessor, buffer, - cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()); - + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()).wait(); device().deallocate_temp(temp_pointer); } else { device().template binary_kernel_launcher(lhs, rhs, buffer, thread_range, - local_range, K); + local_range, K).wait(); } } #endif @@ -1642,12 +1640,6 @@ struct TensorEvaluatorm_result = NULL; } } - // The placeholder accessors must bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - this->m_leftImpl.bind(cgh); - this->m_rightImpl.bind(cgh); - this->m_result.bind(cgh); - } }; } // namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index bfb7b12d1..d59c2c35d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -442,12 +442,6 @@ struct TensorEvaluator, Device> /// required by sycl in order to extract the sycl accessor const TensorEvaluator& impl() const { return m_impl; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif protected: TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 7ccb17423..f2795688a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -61,8 +61,8 @@ struct EigenConvolutionKernel itemID) const { - auto buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); + auto buffer_ptr = buffer_acc; + auto kernel_ptr = kernel_filter; // the required row to be calculated for the for each plane in shered memory const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; @@ -128,8 +128,8 @@ struct EigenConvolutionKernel itemID) const { - auto buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); + auto buffer_ptr = buffer_acc; + auto kernel_ptr = kernel_filter; // the required row to be calculated for the for each plane in shered memory const auto num_input = cl::sycl::range<2>{ (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; @@ -216,8 +216,8 @@ struct EigenConvolutionKernel itemID) const { - auto buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); + auto buffer_ptr = buffer_acc; + auto kernel_ptr = kernel_filter; const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size 
- 1}; const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; @@ -411,7 +411,7 @@ struct TensorEvaluator( m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])); + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])).wait(); break; } @@ -442,7 +442,7 @@ struct TensorEvaluator( m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}); + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}).wait(); break; } @@ -482,7 +482,7 @@ struct TensorEvaluator( m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP); + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP).wait(); break; } @@ -519,13 +519,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_result.bind(cgh); - } -#endif - protected: void evalTo(EvaluatorPointerType data) { TensorMap > result(m_device.get(data), m_dimensions); @@ -324,12 +317,6 @@ struct TensorEvaluator()), max_work_item_sizes( queue.get_device() - .template get_info>()), + .template get_info< + cl::sycl::info::device::max_work_item_sizes<3>>()), max_mem_alloc_size( queue.get_device() .template get_info< @@ -69,7 +70,6 @@ struct SyclDeviceInfo { } // end namespace internal } // end namespace TensorSycl -typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; // All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and // can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently // TensorFlow via the Eigen SYCL Backend. 
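The SyclDeviceInfo change reflects SYCL 2020 turning max_work_item_sizes into a dimension-templated info descriptor that returns a sycl::id<3>. A standalone query looks like:

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      sycl::queue q;
      // SYCL 2020: the descriptor takes the dimension count as a template
      // argument and yields an id<3> rather than an untyped triple.
      sycl::id<3> max_sizes =
          q.get_device().get_info<sycl::info::device::max_work_item_sizes<3>>();
      std::cout << max_sizes[0] << " x " << max_sizes[1] << " x "
                << max_sizes[2] << '\n';
    }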
@@ -110,147 +110,52 @@ class QueueInterface { explicit QueueInterface( const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, unsigned num_threads = std::thread::hardware_concurrency()) - : m_queue(dev_or_sel, handler), -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - m_prog(m_queue.get_context(), get_sycl_supported_devices()), -#endif + : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}}, m_thread_pool(num_threads), - m_device_info(m_queue) { -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - m_prog.build_with_kernel_type(); - auto f = [&](cl::sycl::handler &cgh) { - cgh.single_task(m_prog.get_kernel(), - [=]() {}) - }; - EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); -#endif - } + m_device_info(m_queue) {} template explicit QueueInterface( const DeviceOrSelector &dev_or_sel, unsigned num_threads = std::thread::hardware_concurrency()) - : QueueInterface(dev_or_sel, - [this](cl::sycl::exception_list l) { - this->exception_caught_ = this->sycl_async_handler(l); - }, - num_threads) {} - + : QueueInterface( + dev_or_sel, + [this](cl::sycl::exception_list l) { + this->exception_caught_ = this->sycl_async_handler(l); + }, + num_threads) {} + explicit QueueInterface( - const cl::sycl::queue& q, unsigned num_threads = std::thread::hardware_concurrency()) - : m_queue(q), -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - m_prog(m_queue.get_context(), get_sycl_supported_devices()), -#endif - m_thread_pool(num_threads), - m_device_info(m_queue) {} + const cl::sycl::queue &q, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {} -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } -#endif - - /// Attach an existing buffer to the pointer map, Eigen will not reuse it - EIGEN_STRONG_INLINE void *attach_buffer( - cl::sycl::buffer &buf) const { - std::lock_guard lock(pmapper_mutex_); - return static_cast(pMapper.add_pointer(buf)); - } - - /// Detach previously attached buffer - EIGEN_STRONG_INLINE void detach_buffer(void *p) const { - std::lock_guard lock(pmapper_mutex_); - TensorSycl::internal::SYCLfree(p, pMapper); - } - - /// Allocating device pointer. This pointer is actually an 8 bytes host - /// pointer used as key to access the sycl device buffer. The reason is that - /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode - /// expressions. So we create a key pointer to be used in Eigen expression - /// construction. When we convert the Eigen construction into the sycl - /// construction we use this pointer as a key in our buffer_map and we make - /// sure that we dedicate only one buffer only for this pointer. The device - /// pointer would be deleted by calling deallocate function. 
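Note that the queue is now created with the in_order property, so successively submitted commands execute in submission order without the event tracking that the old EIGEN_SYCL_STORE_LATEST_EVENT machinery provided. A minimal sketch of the behaviour this relies on:

    #include <sycl/sycl.hpp>

    int main() {
      // In-order queue: each command implicitly depends on the previous one.
      sycl::queue q{sycl::default_selector_v, sycl::property::queue::in_order()};

      int* data = sycl::malloc_device<int>(1024, q);
      q.fill(data, 0, 1024);                                   // runs first
      q.parallel_for(sycl::range<1>(1024),
                     [=](sycl::id<1> i) { data[i] += 1; });    // then this
      q.wait();
      sycl::free(data, q);
    }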
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { #if EIGEN_MAX_ALIGN_BYTES > 0 - size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; - if (align > 0) { - num_bytes += EIGEN_MAX_ALIGN_BYTES - align; - } + return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES, + num_bytes, m_queue); +#else + return (void *)cl::sycl::malloc_device(num_bytes, m_queue); #endif - std::lock_guard lock(pmapper_mutex_); - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); } EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { -#if EIGEN_MAX_ALIGN_BYTES > 0 - size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; - if (align > 0) { - num_bytes += EIGEN_MAX_ALIGN_BYTES - align; - } -#endif - std::lock_guard lock(pmapper_mutex_); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - if (scratch_buffers.empty()) { - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); - ; - } else { - for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { - auto buff = pMapper.get_buffer(*it); - if (buff.get_size() >= num_bytes) { - auto ptr = *it; - scratch_buffers.erase(it); - return ptr; - } else { - ++it; - } - } - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); - } -#else - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); -#endif - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, data_t> - get(data_t *data) const { - return get_range_accessor(data); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( - TensorSycl::internal::RangeAccess - data) const { - return static_cast(data.get_virtual_pointer()); + return (void *)cl::sycl::malloc_device(num_bytes, m_queue); } - EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { - std::lock_guard lock(pmapper_mutex_); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - scratch_buffers.insert(p); -#else - TensorSycl::internal::SYCLfree(p, pMapper); -#endif - } - template - EIGEN_STRONG_INLINE void deallocate_temp( - const TensorSycl::internal::RangeAccess &p) const { - deallocate_temp(p.get_virtual_pointer()); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const { + return data; + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); } + + EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const { + deallocate_temp(const_cast(p)); } - /// This is used to deallocate the device pointer. p is used as a key inside - /// the map to find the device buffer and delete it. 
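allocate and allocate_temp now return real USM device pointers instead of 8-byte host keys, so the scratch-buffer cache and the pointer-map lock disappear. The same allocation pattern in isolation (the alignment value is illustrative):

    #include <sycl/sycl.hpp>

    void* device_alloc(std::size_t num_bytes, sycl::queue& q,
                       std::size_t align /* e.g. EIGEN_MAX_ALIGN_BYTES */) {
      // Aligned USM device allocation, falling back to plain malloc_device
      // when no alignment is requested (mirrors the #if above).
      return align > 0 ? sycl::aligned_alloc_device(align, num_bytes, q)
                       : sycl::malloc_device(num_bytes, q);
    }

    void device_free(void* p, sycl::queue& q) { sycl::free(p, q); }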
EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard lock(pmapper_mutex_); - TensorSycl::internal::SYCLfree(p, pMapper); - } - - EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard lock(pmapper_mutex_); - TensorSycl::internal::SYCLfreeAll(pMapper); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - scratch_buffers.clear(); -#endif + cl::sycl::free(p, m_queue); } /// The memcpyHostToDevice is used to copy the data from host to device @@ -260,24 +165,7 @@ class QueueInterface { EIGEN_STRONG_INLINE void memcpyHostToDevice( void *dst, const void *src, size_t n, std::function callback) const { - static const auto write_mode = cl::sycl::access::mode::discard_write; - static const auto global_access = cl::sycl::access::target::global_buffer; - typedef cl::sycl::accessor - write_accessor; - if (n == 0) { - if (callback) callback(); - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - write_accessor dst_acc = get_range_accessor(cgh, dst, n); - buffer_scalar_t const *ptr = static_cast(src); - auto non_deleter = [](buffer_scalar_t const *) {}; - std::shared_ptr s_ptr(ptr, non_deleter); - cgh.copy(s_ptr, dst_acc); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + auto e = m_queue.memcpy(dst, src, n); synchronize_and_callback(e, callback); } @@ -288,24 +176,11 @@ class QueueInterface { EIGEN_STRONG_INLINE void memcpyDeviceToHost( void *dst, const void *src, size_t n, std::function callback) const { - static const auto read_mode = cl::sycl::access::mode::read; - static const auto global_access = cl::sycl::access::target::global_buffer; - typedef cl::sycl::accessor - read_accessor; if (n == 0) { if (callback) callback(); return; } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - read_accessor src_acc = get_range_accessor(cgh, src, n); - buffer_scalar_t *ptr = static_cast(dst); - auto non_deleter = [](buffer_scalar_t *) {}; - std::shared_ptr s_ptr(ptr, non_deleter); - cgh.copy(src_acc, s_ptr); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + auto e = m_queue.memcpy(dst, src, n); synchronize_and_callback(e, callback); } @@ -313,257 +188,88 @@ class QueueInterface { /// No callback is required here as both arguments are on the device /// and SYCL can handle the dependency. EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - static const auto read_mode = cl::sycl::access::mode::read; - static const auto write_mode = cl::sycl::access::mode::discard_write; if (n == 0) { return; } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - auto src_acc = get_range_accessor(cgh, src, n); - auto dst_acc = get_range_accessor(cgh, dst, n); - cgh.copy(src_acc, dst_acc); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - async_synchronize(e); + m_queue.memcpy(dst, src, n).wait(); } /// the memset function. /// No callback is required here as both arguments are on the device /// and SYCL can handle the dependency. EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - static const auto write_mode = cl::sycl::access::mode::discard_write; if (n == 0) { return; } - auto f = [&](cl::sycl::handler &cgh) { - // Get a typed range accesser to ensure we fill each byte, in case - // `buffer_scalar_t` is not (u)int8_t. 
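Host-to-device and device-to-host copies likewise reduce to queue::memcpy on USM pointers; no write accessors or non-deleting shared_ptrs are needed. The copy-plus-callback flow, with the thread-pool dispatch of synchronize_and_callback elided:

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <functional>

    void copy_async(sycl::queue& q, void* dst, const void* src, std::size_t n,
                    std::function<void()> callback) {
      if (n == 0) {
        if (callback) callback();
        return;
      }
      sycl::event e = q.memcpy(dst, src, n);  // accepts host or USM pointers
      e.wait();  // Eigen instead waits on a pool thread, then invokes:
      if (callback) callback();
    }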
- auto dst_acc = get_typed_range_accessor(cgh, data, n); - cgh.fill(dst_acc, static_cast(c)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - async_synchronize(e); + m_queue.memset(data, c, n).wait(); } - template - EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const { - static const auto write_mode = cl::sycl::access::mode::discard_write; + template + EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const { if (begin == end) { return; } - const ptrdiff_t count = end - begin; - auto f = [&](cl::sycl::handler &cgh) { - auto dst_acc = get_typed_range_accessor(cgh, begin, count); - cgh.fill(dst_acc, value); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - async_synchronize(e); - } - - /// Get a range accessor to the virtual pointer's device memory. This range - /// accessor will allow access to the memory from the pointer to the end of - /// the buffer. - /// - /// NOTE: Inside a kernel the range accessor will always be indexed from the - /// start of the buffer, so the offset in the accessor is only used by - /// methods like handler::copy and will not be available inside a kernel. - template - EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess - get_range_accessor(const void *ptr) const { - static const auto global_access = cl::sycl::access::target::global_buffer; - static const auto is_place_holder = cl::sycl::access::placeholder::true_t; - typedef TensorSycl::internal::RangeAccess ret_type; - typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; - - std::lock_guard lock(pmapper_mutex_); - - auto original_buffer = pMapper.get_buffer(ptr); - const ptrdiff_t offset = pMapper.get_offset(ptr); - eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)"); - eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)"); - const ptrdiff_t typed_offset = offset / sizeof(T); - eigen_assert(typed_offset >= 0); - const auto typed_size = original_buffer.get_size() / sizeof(T); - auto buffer = original_buffer.template reinterpret< - std::remove_const_t>( - cl::sycl::range<1>(typed_size)); - const ptrdiff_t size = buffer.get_count() - typed_offset; - eigen_assert(size >= 0); - typedef cl::sycl::accessor, - 1, AcMd, global_access, is_place_holder> - placeholder_accessor_t; - const auto start_ptr = static_cast(ptr) - offset; - return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), - cl::sycl::id<1>(typed_offset)), - static_cast(typed_offset), - reinterpret_cast(start_ptr)); - } - - /// Get a range accessor to the virtual pointer's device memory with a - /// specified size. - template - EIGEN_STRONG_INLINE cl::sycl::accessor< - buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_range_accessor(cl::sycl::handler &cgh, const void *ptr, - const Index n_bytes) const { - static const auto global_access = cl::sycl::access::target::global_buffer; - eigen_assert(n_bytes >= 0); - std::lock_guard lock(pmapper_mutex_); - auto buffer = pMapper.get_buffer(ptr); - const ptrdiff_t offset = pMapper.get_offset(ptr); - eigen_assert(offset >= 0); - eigen_assert(offset + n_bytes <= buffer.get_size()); - return buffer.template get_access( - cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); - } - - /// Get a range accessor to the virtual pointer's device memory with a - /// specified type and count. 
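memset and fill map directly onto queue::memset and queue::fill, which removes the typed-range-accessor workaround the deleted comment describes. For example:

    #include <sycl/sycl.hpp>

    int main() {
      sycl::queue q;
      float* buf = sycl::malloc_device<float>(256, q);
      q.memset(buf, 0, 256 * sizeof(float)).wait();  // size in bytes
      q.fill(buf, 1.0f, 256).wait();                 // count in elements
      sycl::free(buf, q);
    }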
- template - EIGEN_STRONG_INLINE cl::sycl::accessor< - T, 1, AcMd, cl::sycl::access::target::global_buffer> - get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr, - const Index count) const { - static const auto global_access = cl::sycl::access::target::global_buffer; - eigen_assert(count >= 0); - std::lock_guard lock(pmapper_mutex_); - auto buffer = pMapper.get_buffer(ptr); - const ptrdiff_t offset = pMapper.get_offset(ptr); - eigen_assert(offset >= 0); - - // Technically we should create a subbuffer for the desired range, - // then reinterpret that. However, I was not able to get changes to reflect - // in the original buffer (only the subbuffer and reinterpretted buffer). - // This current implementation now has the restriction that the buffer - // offset and original buffer size must be a multiple of sizeof(T). - // Note that get_range_accessor(void*) currently has the same restriction. - // - // auto subbuffer = cl::sycl::buffer(buffer, - // cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes)); - eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)"); - eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)"); - const ptrdiff_t typed_offset = offset / sizeof(T); - const size_t typed_size = buffer.get_size() / sizeof(T); - auto reint = buffer.template reinterpret< - std::remove_const_t>( - cl::sycl::range<1>(typed_size)); - return reint.template get_access( - cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset)); - } - - /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if - /// not, the function then adds an entry by creating a sycl buffer for that - /// particular pointer. - template - EIGEN_STRONG_INLINE cl::sycl::accessor< - buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { - std::lock_guard lock(pmapper_mutex_); - return pMapper.get_buffer(ptr) - .template get_access( - cgh); - } - - EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( - const void *ptr) const { - std::lock_guard lock(pmapper_mutex_); - return pMapper.get_buffer(ptr); - } - - EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - std::lock_guard lock(pmapper_mutex_); - return pMapper.get_offset(ptr); + const size_t count = end - begin; + m_queue.fill(begin, value, count).wait(); } template - EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs, - const Rhs &rhs, OutPtr outptr, - Range thread_range, - Index scratchSize, - T... var) const { + EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher( + const Lhs &lhs, const Rhs &rhs, OutPtr outptr, Range thread_range, + Index scratchSize, T... 
var) const { auto kernel_functor = [=](cl::sycl::handler &cgh) { - // binding the placeholder accessors to a commandgroup handler - lhs.bind(cgh); - rhs.bind(cgh); - outptr.bind(cgh); typedef cl::sycl::accessor LocalAccessor; LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); - cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - program().template get_kernel(), -#endif - thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); + cgh.parallel_for(thread_range, + sycl_kernel(scratch, lhs, rhs, outptr, var...)); }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); - async_synchronize(e); + + return m_queue.submit(kernel_functor); } template - EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr, - OutPtr &outptr, - Range thread_range, - Index scratchSize, - T... var) const { + EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr, + OutPtr &outptr, + Range thread_range, + Index scratchSize, + T... var) const { auto kernel_functor = [=](cl::sycl::handler &cgh) { - // binding the placeholder accessors to a commandgroup handler - inptr.bind(cgh); - outptr.bind(cgh); typedef cl::sycl::accessor LocalAccessor; LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); - cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - program().template get_kernel(), -#endif - thread_range, sycl_kernel(scratch, inptr, outptr, var...)); + cgh.parallel_for(thread_range, + sycl_kernel(scratch, inptr, outptr, var...)); }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); - async_synchronize(e); + return m_queue.submit(kernel_functor); } - template - EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr, - Range thread_range, - Index scratchSize, - T... var) const { + template + EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher( + const InPtr &inptr, Range thread_range, Index scratchSize, + T... 
-  template <typename OutScalar, typename sycl_kernel, typename InPtr,
-            typename Range, typename Index, typename... T>
-  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
-                                                   Range thread_range,
-                                                   Index scratchSize,
-                                                   T... var) const {
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(
+      const InPtr &inptr, Range thread_range, Index scratchSize,
+      T... var) const {
     auto kernel_functor = [=](cl::sycl::handler &cgh) {
-      // binding the placeholder accessors to a commandgroup handler
-      inptr.bind(cgh);
       typedef cl::sycl::accessor<OutScalar, 1,
                                  cl::sycl::access::mode::read_write,
                                  cl::sycl::access::target::local>
           LocalAccessor;
       LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
-      cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-          program().template get_kernel<sycl_kernel>(),
-#endif
-          thread_range, sycl_kernel(scratch, inptr, var...));
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
     };
-    cl::sycl::event e;
-    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
-    async_synchronize(e);
-  }
+    return m_queue.submit(kernel_functor);
+  }
 
   EIGEN_STRONG_INLINE void synchronize() const {
 #ifdef EIGEN_EXCEPTIONS
@@ -573,14 +279,6 @@ class QueueInterface {
 #endif
   }
 
-
-  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
-    set_latest_event(e);
-#ifndef EIGEN_SYCL_ASYNC_EXECUTION
-    synchronize();
-#endif
-  }
-
   template <typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                               Index &rng, Index &GRange) const {
@@ -776,48 +474,9 @@ class QueueInterface {
     return !exception_caught_;
   }
 
-  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
-    std::lock_guard<std::mutex> lock(event_mutex_);
-    return latest_events_[std::this_thread::get_id()];
-#else
-    eigen_assert(false);
-    return cl::sycl::event();
-#endif
-  }
-
-  // destructor
-  ~QueueInterface() {
-    pMapper.clear();
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
-    scratch_buffers.clear();
-#endif
-  }
-
-  template <typename T>
-  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
-      cl::sycl::access::mode::read_write, T>
-  get_null_accessor() const {
-    eigen_assert(null_buff_simulator.get_size() % sizeof(T) == 0 && "The null buffer size must be a multiple of sizeof(T)");
-    const ptrdiff_t typed_size = null_buff_simulator.get_size() / sizeof(T);
-    eigen_assert(typed_size > 0);
-    auto typed_null_buff =
-        null_buff_simulator.template reinterpret<T>(cl::sycl::range<1>(typed_size));
-    return TensorSycl::internal::RangeAccess<
-        cl::sycl::access::mode::read_write, T>(typed_null_buff);
-  }
-
  protected:
-  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
-    std::lock_guard<std::mutex> lock(event_mutex_);
-    latest_events_[std::this_thread::get_id()] = e;
-#else
-    EIGEN_UNUSED_VARIABLE(e);
-#endif
-  }
-
   void synchronize_and_callback(cl::sycl::event e,
                                 const std::function<void()> &callback) const {
-    set_latest_event(e);
     if (callback) {
       auto callback_ = [=]() {
 #ifdef EIGEN_EXCEPTIONS
@@ -850,29 +509,8 @@ class QueueInterface {
 
   /// class members:
   bool exception_caught_ = false;
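The members deleted just below are the heart of the old memory model: allocate() handed out fake integer addresses, and pMapper translated them back to a cl::sycl::buffer plus offset on every accessor creation. With USM the allocator returns a real device address, so ordinary pointer arithmetic replaces the map lookup. A rough sketch of the difference (the commented lines paraphrase the removed mechanism and are not a usable API):

#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue q;
  // Old buffer world (removed): a fake address was only meaningful through the
  // map, e.g. pMapper.get_buffer(vptr) and pMapper.get_offset(vptr).
  // USM world (current): the pointer is directly usable on the device.
  float *base = cl::sycl::malloc_device<float>(256, q);
  float *sub = base + 128;  // plain arithmetic, no PointerMapper involved
  q.parallel_for(cl::sycl::range<1>(128),
                 [=](cl::sycl::id<1> i) { sub[i] = 0.0f; })
      .wait();
  cl::sycl::free(base, q);
  return 0;
}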
-
-  mutable std::mutex pmapper_mutex_;
-
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
-  mutable std::mutex event_mutex_;
-  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
-#endif
-
-  /// std::map is the container used to make sure that we create only one buffer
-  /// per pointer. The lifespan of the buffer now depends on the lifespan of
-  /// SyclDevice. If a non-read-only pointer is needed to be accessed on the
-  /// host we should manually deallocate it.
-  mutable TensorSycl::internal::PointerMapper pMapper;
-  cl::sycl::buffer<buffer_scalar_t, 1> null_buff_simulator =
-      cl::sycl::buffer<buffer_scalar_t, 1>(cl::sycl::range<1>(128));
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
-  mutable std::unordered_set<void *> scratch_buffers;
-#endif
   /// sycl queue
   mutable cl::sycl::queue m_queue;
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-  mutable cl::sycl::program m_prog;
-#endif
-
   /// The thread pool is used to wait on events and call callbacks
   /// asynchronously
   mutable Eigen::ThreadPool m_thread_pool;
@@ -896,32 +534,6 @@ struct SyclDeviceBase {
 struct SyclDevice : public SyclDeviceBase {
   explicit SyclDevice(const QueueInterface *queue_stream)
       : SyclDeviceBase(queue_stream) {}
-
-  template <typename T>
-  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
-      cl::sycl::access::mode::read_write, T>
-  get_null_accessor() const {
-    return queue_stream()->template get_null_accessor<T>();
-  }
-  // this is the accessor used to construct the evaluator
-  template <cl::sycl::access::mode AcMd, typename T>
-  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
-  get_range_accessor(const void *ptr) const {
-    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
-  }
-
-  // get sycl accessor
-  template <cl::sycl::access::mode AcMd>
-  EIGEN_STRONG_INLINE cl::sycl::accessor<
-      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
-  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
-    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
-  }
-
-  /// Accessing the created sycl device buffer for the device pointer
-  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
-      const void *ptr) const {
-    return queue_stream()->get_sycl_buffer(ptr);
-  }
 
   /// This is used to prepare the number of threads and also the number of
   /// threads per block for sycl kernels
@@ -966,40 +578,14 @@ struct SyclDevice : public SyclDeviceBase {
   EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
     queue_stream()->deallocate_temp(buffer);
   }
-  template <cl::sycl::access::mode AcMd, typename T>
-  EIGEN_STRONG_INLINE void deallocate_temp(
-      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
+
+  EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const {
     queue_stream()->deallocate_temp(buffer);
   }
-  EIGEN_STRONG_INLINE void deallocate_all() const {
-    queue_stream()->deallocate_all();
-  }
 
   template <typename data_t>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
-      cl::sycl::access::mode::read_write, data_t>
-  get(data_t *data) const {
-    return queue_stream()->get(data);
-  }
-  template <typename data_t>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
-      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
-                                        data_t>
-          data) const {
-    return queue_stream()->get(data);
-  }
-
-  /// attach existing buffer
-  EIGEN_STRONG_INLINE void *attach_buffer(
-      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
-    return queue_stream()->attach_buffer(buf);
-  }
-  /// detach buffer
-  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
-    queue_stream()->detach_buffer(p);
-  }
-  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
-    return queue_stream()->get_offset(ptr);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
+    return data;
   }
 
   // some runtime conditions that can be applied here
@@ -1029,19 +615,14 @@ struct SyclDevice : public SyclDeviceBase {
     queue_stream()->memset(data, c, n);
   }
   /// the fill function
-  template <typename T>
-  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
     queue_stream()->fill(begin, end, value);
   }
   /// returning the sycl queue
   EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
     return queue_stream()->sycl_queue();
   }
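EIGEN_SYCL_USE_PROGRAM_CLASS and the forwarder deleted below go away because cl::sycl::program was dropped from SYCL 2020; kernel_bundle is the nearest replacement. A hedged sketch of ahead-of-time compilation in the SYCL 2020 spellings (the patched Eigen code does not do this; it simply lets kernels build on first submission):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  // Roughly what program::build_with_kernel_type<K>() used to achieve,
  // but for every kernel in the context at once.
  auto input =
      sycl::get_kernel_bundle<sycl::bundle_state::input>(q.get_context());
  auto exe = sycl::build(input);
  q.submit([&](sycl::handler &cgh) {
     cgh.use_kernel_bundle(exe);
     cgh.single_task([] { /* kernel is already built at this point */ });
   }).wait();
  return 0;
}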
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
-    return queue_stream()->program();
-  }
-#endif
 
   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
@@ -1081,13 +662,6 @@ struct SyclDevice : public SyclDeviceBase {
   EIGEN_STRONG_INLINE void synchronize() const {
     queue_stream()->synchronize();
   }
-  EIGEN_STRONG_INLINE void async_synchronize(
-      cl::sycl::event e = cl::sycl::event()) const {
-    queue_stream()->async_synchronize(e);
-  }
-  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
-    return queue_stream()->get_latest_event();
-  }
 
   // This function checks if the runtime recorded an error for the
   // underlying stream device.
@@ -1109,20 +683,20 @@ struct SyclDevice : public SyclDeviceBase {
     return queue_stream()->getDeviceVendor();
   }
   template <typename OutScalar, typename sycl_kernel, typename... T>
-  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
-    queue_stream()->template binary_kernel_launcher<OutScalar, sycl_kernel>(
-        var...);
+  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
+    return queue_stream()
+        ->template binary_kernel_launcher<OutScalar, sycl_kernel>(var...);
   }
   template <typename OutScalar, typename sycl_kernel, typename... T>
-  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
-    queue_stream()->template unary_kernel_launcher<OutScalar, sycl_kernel>(
-        var...);
+  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
+    return queue_stream()
+        ->template unary_kernel_launcher<OutScalar, sycl_kernel>(var...);
   }
   template <typename OutScalar, typename sycl_kernel, typename... T>
-  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
-    queue_stream()->template nullary_kernel_launcher<OutScalar, sycl_kernel>(
-        var...);
+  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
+    return queue_stream()
+        ->template nullary_kernel_launcher<OutScalar, sycl_kernel>(var...);
   }
 };
 }  // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 7e13c69e3..461763203 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -217,13 +217,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
   ArgType expression() const { return m_expression; }
-  #ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-    m_buffer.bind(cgh);
-  }
-  #endif
+
+
  private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index b16e5a661..e5beb82a2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -183,12 +183,6 @@ struct TensorEvaluator
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_data.bind(cgh);
-  }
-#endif
  protected:
   EvaluatorPointerType m_data;
   Dimensions m_dims;
@@ -215,13 +209,7 @@ Eigen::half loadConstant(const Eigen::half* address) {
   return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
 }
 #endif
-#ifdef EIGEN_USE_SYCL
-// overload of load constant should be implemented here based on range access
-template <cl::sycl::access::mode AcMd, typename T>
-T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
-  return *address;
-}
-#endif
+
 }  // namespace internal
 
 // Default evaluator for rvalues
@@ -338,12 +326,7 @@ struct TensorEvaluator<const Derived, Device>
   }
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_data.bind(cgh);
-  }
-#endif
+
 protected:
   EvaluatorPointerType m_data;
   Dimensions m_dims;
@@ -425,13 +408,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_argImpl.bind(cgh);
-  }
-#endif
-
  private:
   const NullaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
@@ -538,14 +514,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
-    m_argImpl.bind(cgh);
-  }
-#endif
-
-
  private:
   const Device EIGEN_DEVICE_REF m_device;
   const UnaryOp m_functor;
@@ -678,13 +646,6 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_arg1Impl.bind(cgh);
-    m_arg2Impl.bind(cgh);
-    m_arg3Impl.bind(cgh);
-  }
-#endif
   TensorEvaluator<Arg1Type, Device> m_arg1Impl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 92d04f690..d8b99e963 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -743,7 +743,7 @@ class TensorExecutor {
         evaluator,
         cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                               cl::sycl::range<1>(tileSize)),
-        Index(1), range);
+        Index(1), range).wait();
   }
   evaluator.cleanup();
 }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index c744d7955..4eaa60b70 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -207,12 +207,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
   }
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_data.bind(cgh);
-  }
-#endif
 
 private:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 4550fcab0..facf7d1c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -219,13 +219,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buffer; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_buffer.bind(cgh);
-    m_impl.bind(cgh);
-  }
-#endif
 private:
   TensorEvaluator<ArgType, Device> m_impl;
   const ArgType m_op;
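The deletion repeated across these evaluator files is the bind(cl::sycl::handler&) hook: placeholder accessors had to be registered with each command group before use, so every evaluator forwarded bind() to its children. A USM pointer is captured by value like any other kernel argument, which is why the hook can be removed wholesale. A small sketch of the two styles, with hypothetical names:

#include <sycl/sycl.hpp>

// Placeholder-accessor style (removed): registration per submit was mandatory,
// via cgh.require(acc) — which is what Eigen's bind(cgh) chains boiled down to.
// USM style (current): nothing to register, the pointer rides in the capture.
sycl::event zero(sycl::queue &q, float *data, size_t n) {
  return q.parallel_for(sycl::range<1>(n),
                        [=](sycl::id<1> i) { data[i] = 0.0f; });
}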
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index b52833cea..4069b7de0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -107,22 +107,6 @@ struct GpuDevice;
 struct SyclDevice;
 
 #ifdef EIGEN_USE_SYCL
-
-template <typename T> struct MakeSYCLPointer {
-  typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
-};
-
-template <typename T>
-EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
-constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
-  return data;
-}
-
-template <typename T>
-struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
-template <typename T>
-struct StorageMemory<const T, SyclDevice> : StorageMemory<T, SyclDevice> {};
-
 namespace TensorSycl {
 namespace internal{
 template class GenericNondeterministicReducer;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 9b8469b98..83f20eb18 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -266,11 +266,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
-#endif
-
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index,
                                                                  array<Index, NumDims>& coords) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index a9b281fef..3ce021c1a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -507,14 +507,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index 818819408..7266305ec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -228,13 +228,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 8b1408599..5a727bc82 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -123,13 +123,6 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
     }
   }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index b696190b2..4445343e4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -221,12 +221,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-  #ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-  #endif
 protected:
   TensorEvaluator<ArgType, Device> m_impl;
   NewDimensions m_dimensions;
@@ -655,12 +649,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
     return NULL;
   }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
@@ -1004,12 +992,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
  private:
   struct BlockIteratorState {
     BlockIteratorState()
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 413c5c48e..61a6c9842 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -269,13 +269,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index fdb473336..8cd786722 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -541,24 +541,6 @@ class TensorReductionOp : public TensorBase
 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
 struct TensorReductionEvaluatorBase;
 
-namespace internal {
-namespace reduction {
-
-template <typename CoeffReturnType, typename Device>
-EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
-    typename std::enable_if<std::is_same<Device, Eigen::SyclDevice>::value, const Device>::type& dev) {
-  return (dev.template get_null_accessor<CoeffReturnType>());
-}
-
-template <typename CoeffReturnType, typename Device>
-EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
-    typename std::enable_if<!std::is_same<Device, Eigen::SyclDevice>::value, const Device>::type&) {
-  return NULL;
-}
-
-}// end namespace reduction
-} // end namespace internal
-
 // Eval as rvalue
 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
 struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
@@ -623,7 +605,7 @@ static constexpr bool RunningOnGPU = false;
   EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
       : m_impl(op.expression(), device),
         m_reducer(op.reducer()),
-        m_result(internal::reduction::get_null_value<CoeffReturnType, Device>(device)),
+        m_result(NULL),
         m_device(device) {
     EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
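With RangeAccess gone there is no longer a device-specific notion of a null pointer (the old SYCL path manufactured one out of a real 128-byte buffer via get_null_accessor), so the get_null_value dispatch above loses its purpose and every device now starts with m_result = NULL. The evaluator then allocates only when a temporary is genuinely needed; schematically, and not the literal Eigen code:

template <typename Scalar, typename Device>
Scalar *pick_output(Scalar *user_dest, Scalar *&m_result, const Device &dev,
                    size_t n) {
  if (user_dest) return user_dest;  // write straight through, m_result stays NULL
  // Lazy temporary, released again in cleanup(); allocate_temp is the
  // existing Eigen device API already used elsewhere in this patch.
  m_result = static_cast<Scalar *>(dev.allocate_temp(n * sizeof(Scalar)));
  return m_result;
}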
@@ -911,13 +893,6 @@ static constexpr bool RunningOnGPU = false;
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
   EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-    m_result.bind(cgh);
-  }
-#endif
 
 private:
   template friend struct internal::GenericDimReducer;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 715797d62..ea3c2dce5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -96,8 +96,8 @@ struct SecondStepFullReducer {
     // algorithm must be changed if the number of reduce per thread in the
     // second step is greater than 1. Otherwise, the result will be wrong.
     const Index localid = itemID.get_local_id(0);
-    auto aInPtr = aI.get_pointer() + localid;
-    auto aOutPtr = outAcc.get_pointer();
+    auto aInPtr = aI + localid;
+    auto aOutPtr = outAcc;
     CoeffReturnType *scratchptr = scratch.get_pointer();
     CoeffReturnType accumulator = *aInPtr;
 
@@ -146,7 +146,7 @@ class FullReductionKernelFunctor {
   template <bool Vect>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
       const cl::sycl::nd_item<1> &itemID) const {
-    auto output_ptr = final_output.get_pointer();
+    auto output_ptr = final_output;
     Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
     Index globalid = itemID.get_global_id(0);
     Index localid = itemID.get_local_id(0);
@@ -185,7 +185,7 @@ class FullReductionKernelFunctor {
   template <bool Vect>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
       const cl::sycl::nd_item<1> &itemID) const {
-    auto output_ptr = final_output.get_pointer();
+    auto output_ptr = final_output;
     Index globalid = itemID.get_global_id(0);
     Index localid = itemID.get_local_id(0);
     // vectorizable parts
@@ -232,7 +232,7 @@ class GenericNondeterministicReducer {
     //This is to bypass the statefull condition in Eigen meanReducer
     Op non_const_functor;
     std::memcpy(&non_const_functor, &functor, sizeof (Op));
-    auto output_accessor_ptr = output_accessor.get_pointer();
+    auto output_accessor_ptr = output_accessor;
     Index globalid = static_cast<Index>(itemID.get_global_linear_id());
     if (globalid < range) {
       CoeffReturnType accum = functor.initialize();
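The edits in this file are mechanical: every accessor.get_pointer() becomes the pointer itself, because kernel arguments are now USM pointers. For orientation, a free-standing sketch of the shape these functors implement — a per-work-group tree reduction in local memory — assuming float addition instead of Eigen's Op, with n divisible by a power-of-two work-group size wg:

#include <CL/sycl.hpp>

cl::sycl::event partial_sums(cl::sycl::queue &q, const float *in, float *out,
                             size_t n, size_t wg) {
  return q.submit([&](cl::sycl::handler &cgh) {
    cl::sycl::local_accessor<float, 1> scratch(cl::sycl::range<1>(wg), cgh);
    cgh.parallel_for(
        cl::sycl::nd_range<1>(cl::sycl::range<1>(n), cl::sycl::range<1>(wg)),
        [=](cl::sycl::nd_item<1> it) {
          const size_t lid = it.get_local_id(0);
          scratch[lid] = in[it.get_global_id(0)];  // USM pointer, indexed directly
          for (size_t s = wg / 2; s > 0; s /= 2) {  // tree reduction
            it.barrier(cl::sycl::access::fence_space::local_space);
            if (lid < s) scratch[lid] += scratch[lid + s];
          }
          if (lid == 0) out[it.get_group(0)] = scratch[0];  // one partial per group
        });
  });
}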
@@ -313,9 +313,9 @@ struct PartialReductionKernel {
       Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
     const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
-    auto scratchPtr = scratch.get_pointer().get();
+    CoeffReturnType* scratchPtr = scratch.get_pointer();
     auto outPtr =
-        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
+        output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
     CoeffReturnType accumulator = op.initialize();
     element_wise_reduce(globalRId, globalPId, accumulator);
@@ -387,7 +387,7 @@ struct SecondStepPartialReduction {
     if (globalId >= num_coeffs_to_preserve) return;
 
-    auto in_ptr = input_accessor.get_pointer() + globalId;
+    auto in_ptr = input_accessor + globalId;
 
     OutScalar accumulator = op.initialize();
     // num_coeffs_to_reduce is not bigger that 256
@@ -395,7 +395,7 @@ struct SecondStepPartialReduction {
       op.reduce(*in_ptr, &accumulator);
       in_ptr += num_coeffs_to_preserve;
     }
-    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
+    output_accessor[globalId] = op.finalize(accumulator);
   }
 };  // namespace internal
@@ -453,21 +453,18 @@ struct PartialReducerLauncher {
     EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
     dev.template unary_kernel_launcher(
         self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups,
         rNumGroups, num_coeffs_to_preserve,
-        num_coeffs_to_reduce);
-
+        num_coeffs_to_reduce).wait();
     typedef SecondStepPartialReduction SecondStepPartialReductionKernel;
-
     dev.template unary_kernel_launcher(
         temp_accessor, output,
         cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange),
                               cl::sycl::range<1>(localRange)),
         Index(1),
-        reducer, num_coeffs_to_preserve, rNumGroups);
-
+        reducer, num_coeffs_to_preserve, rNumGroups).wait();
     self.device().deallocate_temp(temp_pointer);
   } else {
     dev.template unary_kernel_launcher(
         self, output, thread_range, scratchSize, reducer, pNumGroups,
         rNumGroups, num_coeffs_to_preserve,
-        num_coeffs_to_reduce);
+        num_coeffs_to_reduce).wait();
   }
   return false;
 }
@@ -512,20 +509,19 @@ struct FullReducer {
         static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
     typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
     dev.template unary_kernel_launcher(self, tmp_global_accessor, thread_range,
-        local_range, inputSize, reducer);
-
+        local_range, inputSize, reducer).wait();
     typedef TensorSycl::internal::SecondStepFullReducer GenericRKernel;
     dev.template unary_kernel_launcher(
         tmp_global_accessor, data,
         cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group),
                               cl::sycl::range<1>(num_work_group)),
         num_work_group,
-        reducer);
-
+        reducer).wait();
     dev.deallocate_temp(temp_pointer);
   } else {
     dev.template unary_kernel_launcher(self, data, thread_range, local_range, inputSize,
-                                       reducer);
+                                       reducer).wait();
+    }
   }
 };
@@ -574,7 +570,7 @@ struct GenericReducer {
     dev.template unary_kernel_launcher(
         self, output,
         cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                               cl::sycl::range<1>(tileSize)),
         Index(1),
-        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
+        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1)).wait();
     return false;
   }
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 342d9e9b5..ee84400e0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -368,13 +368,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 2c574c79f..9d9458a52 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -507,13 +507,6 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     m_impl.cleanup();
   }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-    m_output.bind(cgh);
-  }
-#endif
 protected:
   TensorEvaluator<ArgType, Device> m_impl;
   const Device EIGEN_DEVICE_REF m_device;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
index ecc872b3a..155cc0fdc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -89,8 +89,8 @@ struct ScanKernelFunctor {
   LocalAccessor scratch;
   Evaluator dev_eval;
-  OutAccessor out_accessor;
-  OutAccessor temp_accessor;
+  OutAccessor out_ptr;
+  OutAccessor tmp_ptr;
   const ScanParameters scanParameters;
   Op accumulator;
   const bool inclusive;
@@ -100,8 +100,8 @@ struct ScanKernelFunctor {
       const bool inclusive_)
       : scratch(scratch_),
         dev_eval(dev_eval_),
-        out_accessor(out_accessor_),
-        temp_accessor(temp_accessor_),
+        out_ptr(out_accessor_),
+        tmp_ptr(temp_accessor_),
         scanParameters(scanParameters_),
         accumulator(accumulator_),
         inclusive(inclusive_) {}
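Same translation again below: scratch.get_pointer() disappears in favour of indexing the local accessor directly, and out_ptr/tmp_ptr are plain pointers. As a reference for the up-sweep/down-sweep structure the following hunks edit, here is a compact Blelloch exclusive scan over one work-group, assuming a power-of-two group size and float addition rather than Eigen's accumulator:

#include <CL/sycl.hpp>

cl::sycl::event group_exclusive_scan(cl::sycl::queue &q, const float *in,
                                     float *out, size_t wg) {
  return q.submit([&](cl::sycl::handler &cgh) {
    cl::sycl::local_accessor<float, 1> scratch(cl::sycl::range<1>(wg), cgh);
    cgh.parallel_for(
        cl::sycl::nd_range<1>(cl::sycl::range<1>(wg), cl::sycl::range<1>(wg)),
        [=](cl::sycl::nd_item<1> it) {
          const size_t i = it.get_local_id(0);
          scratch[i] = in[i];
          for (size_t d = 1; d < wg; d *= 2) {  // up-sweep: partial sums in place
            it.barrier(cl::sycl::access::fence_space::local_space);
            if (i % (2 * d) == 2 * d - 1) scratch[i] += scratch[i - d];
          }
          if (i == wg - 1) scratch[i] = 0.0f;  // clear the root
          for (size_t d = wg / 2; d > 0; d /= 2) {  // down-sweep: build prefixes
            it.barrier(cl::sycl::access::fence_space::local_space);
            if (i % (2 * d) == 2 * d - 1) {
              const float t = scratch[i - d];
              scratch[i - d] = scratch[i];
              scratch[i] += t;
            }
          }
          it.barrier(cl::sycl::access::fence_space::local_space);
          out[i] = scratch[i];
        });
  });
}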
@@ -131,9 +131,7 @@ struct ScanKernelFunctor {
       first_step_inclusive_Operation(InclusiveOp) const {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
-    auto out_ptr = out_accessor.get_pointer();
-    auto tmp_ptr = temp_accessor.get_pointer();
-    auto scratch_ptr = scratch.get_pointer().get();
+
     for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
       Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
@@ -190,7 +188,7 @@ struct ScanKernelFunctor {
         }
         private_offset *= 2;
       }
-      scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
+      scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
           private_scan[PacketSize - 1 + packetIndex];
       private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
       // traverse down tree & build scan
@@ -219,9 +217,9 @@ struct ScanKernelFunctor {
           Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
           Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
           CoeffReturnType accum = accumulator.initialize();
-          accumulator.reduce(scratch_ptr[ai], &accum);
-          accumulator.reduce(scratch_ptr[bi], &accum);
-          scratch_ptr[bi] = accumulator.finalize(accum);
+          accumulator.reduce(scratch[ai], &accum);
+          accumulator.reduce(scratch[bi], &accum);
+          scratch[bi] = accumulator.finalize(accum);
         }
         offset *= 2;
       }
@@ -234,10 +232,10 @@ struct ScanKernelFunctor {
             scanParameters.non_scan_size + group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) + block_id;
-        tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
+        tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
       }
       // clear the last element
-      scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+      scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
     }
     // traverse down tree & build scan
     for (Index d = 1; d < scratch_stride; d *= 2) {
@@ -248,10 +246,10 @@ struct ScanKernelFunctor {
         Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
         Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
         CoeffReturnType accum = accumulator.initialize();
-        accumulator.reduce(scratch_ptr[ai], &accum);
-        accumulator.reduce(scratch_ptr[bi], &accum);
-        scratch_ptr[ai] = scratch_ptr[bi];
-        scratch_ptr[bi] = accumulator.finalize(accum);
+        accumulator.reduce(scratch[ai], &accum);
+        accumulator.reduce(scratch[bi], &accum);
+        scratch[ai] = scratch[bi];
+        scratch[bi] = accumulator.finalize(accum);
       }
     }
     // Synchronise
@@ -262,7 +260,7 @@ struct ScanKernelFunctor {
       EIGEN_UNROLL_LOOP
       for (Index i = 0; i < PacketSize; i++) {
         CoeffReturnType accum = private_scan[packetIndex + i];
-        accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+        accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
         private_scan[packetIndex + i] = accumulator.finalize(accum);
       }
     }
@@ -294,22 +292,20 @@ struct ScanAdjustmentKernelFunctor {
   typedef cl::sycl::accessor LocalAccessor;
   static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2;
 
-  InAccessor in_accessor;
-  OutAccessor out_accessor;
+  InAccessor in_ptr;
+  OutAccessor out_ptr;
   const ScanParameters scanParameters;
   Op accumulator;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor,
                                                                     InAccessor in_accessor_,
                                                                     OutAccessor out_accessor_,
                                                                     const ScanParameters scanParameters_,
                                                                     Op accumulator_)
-      : in_accessor(in_accessor_),
-        out_accessor(out_accessor_),
+      : in_ptr(in_accessor_),
+        out_ptr(out_accessor_),
         scanParameters(scanParameters_),
         accumulator(accumulator_) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
-    auto in_ptr = in_accessor.get_pointer();
-    auto out_ptr = out_accessor.get_pointer();
-
     for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
       Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
@@ -426,7 +422,7 @@ struct SYCLAdjustBlockOffset {
     AdjustFuctor;
     dev.template unary_kernel_launcher(in_ptr, out_ptr, scan_info.get_thread_range(),
                                        scan_info.max_elements_per_block,
-                                       scan_info.get_scan_parameter(), accumulator);
+                                       scan_info.get_scan_parameter(), accumulator).wait();
   }
 };
@@ -449,7 +445,7 @@ struct ScanLauncher_impl {
     typedef ScanKernelFunctor ScanFunctor;
     dev.template binary_kernel_launcher(
         in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
-        scan_info.get_scan_parameter(), accumulator, inclusive);
+        scan_info.get_scan_parameter(), accumulator, inclusive).wait();
 
     if (scan_info.block_size > 1) {
       ScanLauncher_impl::scan_block(
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 3d97e6684..78f9f92cf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -290,12 +290,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
       Index input_index,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 681c4e5df..1b9a0f16a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -223,12 +223,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index d68220266..55e8360d5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -256,13 +256,6 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
     return result;
   }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
  protected:
   // Given the output index, finds the first index in the input tensor used to compute the trace
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 3523a9431..ac2ad6113 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -535,12 +535,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const {
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index cd9775699..2bb551866 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -148,9 +148,6 @@ if(EIGEN_TEST_SYCL)
   if(EIGEN_SYCL_REG_N)
     add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
   endif()
-  if(EIGEN_SYCL_USE_PROGRAM_CLASS)
-    add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
-  endif()
   if(EIGEN_SYCL_ASYNC_EXECUTION)
     add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
   endif()
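The test deleted in the next file exercised attach_buffer/detach_buffer, which have no USM counterpart: there is no foreign cl::sycl::buffer to adopt, only pointers. Its fill coverage survives in test_device_memory; a self-contained sketch of the same check against raw USM memory, using hypothetical names rather than the Eigen test harness:

#include <CL/sycl.hpp>
#include <cassert>
#include <vector>

int main() {
  cl::sycl::queue q;
  const size_t n = 100;
  float *gpu = cl::sycl::malloc_device<float>(n, q);
  q.fill(gpu, 7.0f, n).wait();  // device-side fill, as SyclDevice::fill now does
  std::vector<float> host(n);
  q.memcpy(host.data(), gpu, n * sizeof(float)).wait();
  for (size_t i = 0; i < n; ++i) assert(host[i] == 7.0f);
  cl::sycl::free(gpu, q);
  return 0;
}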
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 7ba104b03..2f75ef896 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -23,15 +23,6 @@
 #include
 #include
 
-#ifdef SYCL_COMPILER_IS_DPCPP
-template <typename Scalar>
-struct cl::sycl::is_device_copyable<
-    OffByOneScalar<Scalar>,
-    std::enable_if_t<!(std::is_trivially_copyable<OffByOneScalar<Scalar>>::value &&
-                       (std::is_const_v<OffByOneScalar<Scalar>> || std::is_volatile_v<OffByOneScalar<Scalar>>))>>
-    : std::true_type {};
-#endif
-
 template <typename DataType, int DataLayout, typename IndexType>
 void test_device_memory(const Eigen::SyclDevice &sycl_device) {
   IndexType sizeDim1 = 100;
@@ -39,7 +30,7 @@ void test_device_memory(const Eigen::SyclDevice &sycl_device) {
   Tensor<DataType, 1, DataLayout, IndexType> in(tensorRange);
   Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
-
+  // memset
   memset(in1.data(), 1, in1.size() * sizeof(DataType));
   sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
@@ -47,7 +38,7 @@ void test_device_memory(const Eigen::SyclDevice &sycl_device) {
   for (IndexType i=0; i<in.size(); i++)
 
-template <typename DataType, int DataLayout, typename IndexType>
-void test_device_attach_buffer(const Eigen::SyclDevice &sycl_device) {
-  IndexType sizeDim1 = 100;
-
-  array<IndexType, 1> tensorRange = {{sizeDim1}};
-  Tensor<DataType, 1, DataLayout, IndexType> in(tensorRange);
-
-  cl::sycl::buffer<buffer_scalar_t, 1> buffer(cl::sycl::range<1>(sizeDim1 * sizeof(DataType)));
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.attach_buffer(buffer));
-
-  // fill
-  DataType value = DataType(7);
-  std::fill_n(in.data(), in.size(), value);
-  sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);
-
-  // Check that buffer is filled with the correct value.
-  auto reint = buffer.reinterpret<DataType>(cl::sycl::range<1>(sizeDim1));
-  auto access = reint.template get_access<cl::sycl::access::mode::read>();
-  for (IndexType i=0; i<in.size(); i++)
-    VERIFY_IS_EQUAL(in(i), access[i]);
-}
-
 template <typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
   std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
   QueueInterface queueInterface(d);
@@ -112,7 +78,6 @@ template <typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
   //test_device_exceptions(sycl_device);
   /// this test throw an exception. enable it if you want to see the exception
   //test_device_exceptions(sycl_device);
-  test_device_attach_buffer(sycl_device);
 }
 
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {