diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index a30090714..722a5d894 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -15,16 +15,13 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -#include "TensorSyclLegacyPointer.h" - namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) template class MemCopyFunctor { public: - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) - : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} + MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} void operator()(cl::sycl::nd_item<1> itemID) { auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); @@ -55,7 +52,6 @@ namespace Eigen { }; - EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); std::vector::iterator it =devices.begin(); @@ -78,10 +74,11 @@ struct QueueInterface { bool exception_caught_ = false; mutable std::mutex mutex_; + /// std::map is the container used to make sure that we create only one buffer /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - //mutable std::map> buffer_map; + mutable std::map> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename @@ -119,42 +116,45 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + auto buf = cl::sycl::buffer(cl::sycl::range<1>(num_bytes)); + auto ptr =buf.get_access().get_pointer(); + buf.set_final_data(nullptr); std::lock_guard lock(mutex_); - return codeplay::legacy::malloc(num_bytes); + buffer_map.insert(std::pair>(static_cast(ptr),buf)); + return static_cast(ptr); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. EIGEN_STRONG_INLINE void deallocate(void *p) const { std::lock_guard lock(mutex_); - return codeplay::legacy::free(p); + auto it = buffer_map.find(static_cast(p)); + if (it != buffer_map.end()) { + buffer_map.erase(it); + } } EIGEN_STRONG_INLINE void deallocate_all() const { std::lock_guard lock(mutex_); - codeplay::legacy::clear(); + buffer_map.clear(); } - EIGEN_STRONG_INLINE codeplay::legacy::PointerMapper& pointerMapper() const { + EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { std::lock_guard lock(mutex_); - return codeplay::legacy::getPointerMapper(); + auto it1 = buffer_map.find(static_cast(ptr)); + if (it1 != buffer_map.end()){ + return it1; + } + else{ + for(std::map>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ + auto size = it->second.get_size(); + if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; + } + } + std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; + abort(); } - EIGEN_STRONG_INLINE cl::sycl::buffer get_buffer(void* ptr) const { - std::lock_guard lock(mutex_); - return pointerMapper().get_buffer(pointerMapper().get_buffer_id(ptr)); - } - - EIGEN_STRONG_INLINE size_t get_buffer_offset(void* ptr) const { - std::lock_guard lock(mutex_); - return pointerMapper().get_offset(ptr); - } - - /*EIGEN_STRONG_INLINE void* get_buffer_id(void* ptr) const { - std::lock_guard lock(mutex_); - return static_cast(pointerMapper().get_buffer_id(ptr)); - }*/ - // This function checks if the runtime recorded an error for the // underlying stream device. EIGEN_STRONG_INLINE bool ok() const { @@ -165,7 +165,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { } // destructor - ~QueueInterface() { codeplay::legacy::clear(); } + ~QueueInterface() { buffer_map.clear(); } }; struct SyclDevice { @@ -183,11 +183,10 @@ struct SyclDevice { } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer(const void * ptr) const { - return m_queue_stream->get_buffer(const_cast(ptr)); + EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { + return m_queue_stream->find_buffer(ptr)->second; } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { @@ -274,8 +273,6 @@ struct SyclDevice { if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); } } - - /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return m_queue_stream->allocate(num_bytes); @@ -290,15 +287,17 @@ struct SyclDevice { /// the memcpy function template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto offset= m_queue_stream->get_buffer_offset((void*)src); - auto i= m_queue_stream->get_buffer_offset(dst); + auto it1 = m_queue_stream->find_buffer((void*)src); + auto it2 = m_queue_stream->find_buffer(dst); + auto offset= (static_cast(static_cast(src))) - it1->first; + auto i= (static_cast(dst)) - it2->first; offset/=sizeof(Index); i/=sizeof(Index); size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =get_sycl_accessor(cgh, src); - auto dst_acc =get_sycl_accessor(cgh, dst); + auto src_acc =it1->second.template get_access(cgh); + auto dst_acc =it2->second.template get_access(cgh); typedef decltype(src_acc) read_accessor; typedef decltype(dst_acc) write_accessor; cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); @@ -311,11 +310,10 @@ struct SyclDevice { /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that /// this buffer is accessed, the data will be copied to the device. - template EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { auto host_acc= get_sycl_buffer(dst). template get_access(); ::memcpy(host_acc.get_pointer(), src, n); } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination @@ -323,14 +321,15 @@ struct SyclDevice { /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto offset =m_queue_stream->get_buffer_offset((void *)src); + auto it = m_queue_stream->find_buffer(src); + auto offset =static_cast(static_cast(src))- it->first; offset/=sizeof(Index); size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); // Assuming that the dst is the start of the destination pointer auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= get_sycl_accessor(cgh, src); + auto src_acc= it->second.template get_access(cgh); auto dst_acc =dest_buf.template get_access(cgh); typedef decltype(src_acc) read_accessor; typedef decltype(dst_acc) write_accessor; @@ -344,8 +343,7 @@ struct SyclDevice { EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); - auto buf =get_sycl_buffer(static_cast(static_cast(data))); - sycl_queue().submit(memsetCghFunctor(buf,rng, GRange, tileSize, c )); + sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast(static_cast(data))),rng, GRange, tileSize, c )); synchronize(); } @@ -411,6 +409,7 @@ struct SyclDevice { }; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h deleted file mode 100644 index b723592cd..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h +++ /dev/null @@ -1,244 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Ruyman Reyes Codeplay Software Ltd -// Mehdi Goli Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclLegacyPointer.h - * - * \brief: - * Interface for SYCL buffers to behave as a non-deferrenciable pointer - * This can be found in Codeplay's ComputeCpp SDK : legacy_pointer.h - * - **************************************************************************/ - -namespace codeplay { -namespace legacy { - -/** - * PointerMapper - * Associates fake pointers with buffers. - * - */ -class PointerMapper { - public: - /* pointer information definitions - */ - static const unsigned long ADDRESS_BITS = sizeof(void *) * 8; - static const unsigned long BUFFER_ID_BITSIZE = 16u; - static const unsigned long MAX_NUMBER_BUFFERS = (1UL << BUFFER_ID_BITSIZE)-1; - static const unsigned long MAX_OFFSET = (1UL << (ADDRESS_BITS - BUFFER_ID_BITSIZE))-1; - - using base_ptr_t = uintptr_t; - - /* Fake Pointers are constructed using an integer indexing plus - * the offset: - * - * |== MAX_BUFFERS ==|======== MAX_OFFSET ========| - * | Buffer Id | Offset in buffer | - * |=================|============================| - */ - struct legacy_pointer_t { - /* Type for the pointers - */ - base_ptr_t _contents; - - /** Conversions from legacy_pointer_t to - * the void * should just reinterpret_cast the integer - * number - */ - operator void *() const { return reinterpret_cast(_contents); } - - /** - * Convert back to the integer number. - */ - operator base_ptr_t() const { return _contents; } - - /** - * Converts a void * into a legacy pointer structure. - * Note that this will only work if the void * was - * already a legacy_pointer_t, but we have no way of - * checking - */ - legacy_pointer_t(void *ptr) - : _contents(reinterpret_cast(ptr)){}; - - /** - * Creates a legacy_pointer_t from the given integer - * number - */ - legacy_pointer_t(base_ptr_t u) : _contents(u){}; - }; - - /* Whether if a pointer is null or not. - * - * A pointer is nullptr if the buffer id is 0, - * i.e the first BUFFER_ID_BITSIZE are zero - */ - static inline bool is_nullptr(legacy_pointer_t ptr) { - return ((MAX_OFFSET & ptr) == ptr); - } - - /* Base nullptr - */ - const legacy_pointer_t null_legacy_ptr = nullptr; - - /* Data type to create buffer of byte-size elements - */ - using buffer_data_type = uint8_t; - - /* basic type for all buffers - */ - using buffer_t = cl::sycl::buffer; - - /* id of a buffer in the map - */ - typedef short buffer_id; - - /* get_buffer_id - */ - inline buffer_id get_buffer_id(legacy_pointer_t ptr) const { - return ptr >> (ADDRESS_BITS - BUFFER_ID_BITSIZE); - } - - /* - * get_buffer_offset - */ - inline off_t get_offset(legacy_pointer_t ptr) const { - return ptr & MAX_OFFSET;; - } - - /** - * Constructs the PointerMapper structure. - */ - PointerMapper() - : __pointer_list{}, rng_(std::random_device()()), uni_(1, 256){}; - - /** - * PointerMapper cannot be copied or moved - */ - PointerMapper(const PointerMapper &) = delete; - - /** - * empty the pointer list - */ - inline void clear() { - __pointer_list.clear(); - } - - /* generate_id - * Generates a unique id for a buffer. - */ - buffer_id generate_id() { - // Limit the number of attempts to half the combinations - // just to avoid an infinite loop - int numberOfAttempts = 1ul << (BUFFER_ID_BITSIZE / 2); - buffer_id bId; - do { - bId = uni_(rng_); - } while (__pointer_list.find(bId) != __pointer_list.end() && - numberOfAttempts--); - return bId; - } - - /* add_pointer. - * Adds a pointer to the map and returns the fake pointer id. - * This will be the bufferId on the most significant bytes and 0 elsewhere. - */ - legacy_pointer_t add_pointer(buffer_t &&b) { - auto nextNumber = __pointer_list.size(); - buffer_id bId = generate_id(); - __pointer_list.emplace(bId, b); - if (nextNumber > MAX_NUMBER_BUFFERS) { - return null_legacy_ptr; - } - base_ptr_t retVal = bId; - retVal <<= (ADDRESS_BITS - BUFFER_ID_BITSIZE); - return retVal; - } - - /* get_buffer. - * Returns a buffer from the map using the buffer id - */ - buffer_t get_buffer(buffer_id bId) const { - auto it = __pointer_list.find(bId); - if (it != __pointer_list.end()) - return it->second; - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); - } - - /* remove_pointer. - * Removes the given pointer from the map. - */ - void remove_pointer(void *ptr) { - buffer_id bId = this->get_buffer_id(ptr); - __pointer_list.erase(bId); - } - - /* count. - * Return the number of active pointers (i.e, pointers that - * have been malloc but not freed). - */ - size_t count() const { return __pointer_list.size(); } - - private: - /* Maps the buffer id numbers to the actual buffer - * instances. - */ - std::map __pointer_list; - - /* Random number generator for the buffer ids - */ - std::mt19937 rng_; - - /* Random-number engine - */ - std::uniform_int_distribution uni_; -}; - -/** - * Singleton interface to the pointer mapper to implement - * the generic malloc/free C interface without extra - * parameters. - */ -inline PointerMapper &getPointerMapper() { - static PointerMapper thePointerMapper; - return thePointerMapper; -} - -/** - * Malloc-like interface to the pointer-mapper. - * Given a size, creates a byte-typed buffer and returns a - * fake pointer to keep track of it. - */ -inline void *malloc(size_t size) { - // Create a generic buffer of the given size - auto thePointer = getPointerMapper().add_pointer( - PointerMapper::buffer_t(cl::sycl::range<1>{size})); - // Store the buffer on the global list - return static_cast(thePointer); -} - -/** - * Free-like interface to the pointer mapper. - * Given a fake-pointer created with the legacy-pointer malloc, - * destroys the buffer and remove it from the list. - */ -inline void free(void *ptr) { getPointerMapper().remove_pointer(ptr); } - -/** - *clear the pointer list - */ -inline void clear() { - getPointerMapper().clear(); -} - -} // legacy -} // codeplay