[SYCL-2020] Enabling USM support for SYCL. SYCL 1.2.1 did not support USM.

This commit is contained in:
Mehdi Goli 2023-05-05 17:30:36 +00:00 committed by Rasmus Munk Larsen
parent 1698c367a0
commit 0623791930
40 changed files with 164 additions and 1707 deletions
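
At a high level, this change drops the virtual-pointer/buffer-map memory model and manages device memory through SYCL 2020 Unified Shared Memory (USM). A minimal sketch of the USM pattern the new code relies on, assuming a SYCL 2020 implementation (the queue setup, sizes, and names below are illustrative, not Eigen API):

#include <CL/sycl.hpp>
#include <vector>

int main() {
  // In-order queue, matching the property the new QueueInterface sets.
  cl::sycl::queue q{cl::sycl::property_list{cl::sycl::property::queue::in_order()}};
  constexpr size_t n = 1024;
  // Device-only USM allocation: a real device pointer, not a key into a
  // pointer-to-buffer map.
  float* d = cl::sycl::malloc_device<float>(n, q);
  std::vector<float> h(n, 1.0f);
  q.memcpy(d, h.data(), n * sizeof(float)).wait();  // host -> device
  q.fill(d, 2.0f, n).wait();                        // fill n elements on device
  q.memcpy(h.data(), d, n * sizeof(float)).wait();  // device -> host
  cl::sycl::free(d, q);
  return h[0] == 2.0f ? 0 : 1;
}
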

View File

@ -249,7 +249,6 @@ using std::ptrdiff_t;
#endif
#if defined(EIGEN_USE_SYCL)
#include "src/Core/arch/SYCL/SyclMemoryModel.h"
#include "src/Core/arch/SYCL/InteropHeaders.h"
#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
#include "src/Core/arch/SYCL/PacketMath.h"

View File

@ -33,13 +33,9 @@ namespace internal {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
pload##AlignedType<packet_type>( \
const typename unpacket_traits<packet_type>::type* from) { \
using scalar = typename unpacket_traits<packet_type>::type; \
typedef cl::sycl::multi_ptr< \
const scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(from);\
packet_type res{}; \
res.load(0, multi_ptr(from)); \
res.load(0, ptr); \
return res; \
}
@ -54,11 +50,8 @@ SYCL_PLOAD(cl::sycl::cl_double2, )
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
scalar* to, const packet_type& from) { \
typedef cl::sycl::multi_ptr< \
scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
from.store(0, multi_ptr(to)); \
auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(to);\
from.store(0, ptr); \
}
SYCL_PSTORE(float, cl::sycl::cl_float4, )
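
The new pload/pstore bodies above build a generic-address-space multi_ptr from a raw pointer via address_space_cast instead of constructing a multi_ptr from a placeholder accessor. Roughly, the pattern the macros expand to looks like the following sketch (assuming a SYCL 2020 compiler; the helper names are illustrative):

#include <CL/sycl.hpp>

// Device-side helpers mirroring the macro bodies above; `Packet` is a
// cl::sycl::vec type such as cl::sycl::cl_float4.
template <typename Packet, typename Scalar>
inline Packet usm_pload(const Scalar* from) {
  auto ptr = cl::sycl::address_space_cast<
      cl::sycl::access::address_space::generic_space,
      cl::sycl::access::decorated::no>(from);
  Packet res{};
  res.load(0, ptr);  // vec::load accepts a generic-space multi_ptr
  return res;
}

template <typename Packet, typename Scalar>
inline void usm_pstore(Scalar* to, const Packet& from) {
  auto ptr = cl::sycl::address_space_cast<
      cl::sycl::access::address_space::generic_space,
      cl::sycl::access::decorated::no>(to);
  from.store(0, ptr);
}
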
@ -370,60 +363,6 @@ inline cl::sycl::cl_double2 pblend(
}
#endif // SYCL_DEVICE_ONLY
template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, T>& from) {
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}
#define SYCL_PLOAD(Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type> \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD
template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>& from) {
return ploadt<packet_type, Alignment>(from.get_pointer());
}
#define SYCL_PSTORE(alignment) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type>& to, \
const packet_type& from) { \
pstore##alignment(to.get_pointer(), from); \
}
// global space
SYCL_PSTORE()
SYCL_PSTORE(u)
#undef SYCL_PSTORE
template <typename scalar, typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(
Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>
to,
const packet_type& from) {
pstoret<scalar, packet_type, Alignment>(to.get_pointer(), from);
}
} // end namespace internal
} // end namespace Eigen

View File

@ -1,700 +0,0 @@
/***************************************************************************
* Copyright (C) 2017 Codeplay Software Limited
* This Source Code Form is subject to the terms of the Mozilla
* Public License v. 2.0. If a copy of the MPL was not distributed
* with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
*
* SyclMemoryModel.h
*
* Description:
* Interface for SYCL buffers to behave as a non-dereferenceable pointer
* Interface for Placeholder accessor to behave as a pointer on both host
* and device
*
* Authors:
*
* Ruyman Reyes Codeplay Software Ltd.
* Mehdi Goli Codeplay Software Ltd.
* Vanya Yaneva Codeplay Software Ltd.
*
**************************************************************************/
#if defined(EIGEN_USE_SYCL) && \
!defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H)
#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
#include <CL/sycl.hpp>
#ifdef EIGEN_EXCEPTIONS
#include <stdexcept>
#endif
#include <cstddef>
#include <queue>
#include <set>
#include <unordered_map>
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace TensorSycl {
namespace internal {
using sycl_acc_target = cl::sycl::access::target;
using sycl_acc_mode = cl::sycl::access::mode;
/**
* Default values for template arguments
*/
using buffer_data_type_t = uint8_t;
const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer;
const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write;
/**
* PointerMapper
* Associates fake pointers with buffers.
*
*/
class PointerMapper {
public:
using base_ptr_t = std::intptr_t;
/* Structure of a virtual pointer
*
* |================================================|
* | POINTER ADDRESS |
* |================================================|
*/
struct virtual_pointer_t {
/* Type for the pointers
*/
base_ptr_t m_contents;
/** Conversions from virtual_pointer_t to
* void * should just reinterpret_cast the integer number
*/
operator void *() const { return reinterpret_cast<void *>(m_contents); }
/**
* Convert back to the integer number.
*/
operator base_ptr_t() const { return m_contents; }
/**
* Add a certain value to the pointer to create a
* new pointer to that offset
*/
virtual_pointer_t operator+(size_t off) { return m_contents + off; }
/* Numerical order for sorting pointers in containers. */
bool operator<(virtual_pointer_t rhs) const {
return (static_cast<base_ptr_t>(m_contents) <
static_cast<base_ptr_t>(rhs.m_contents));
}
bool operator>(virtual_pointer_t rhs) const {
return (static_cast<base_ptr_t>(m_contents) >
static_cast<base_ptr_t>(rhs.m_contents));
}
/**
* Numerical order for sorting pointers in containers
*/
bool operator==(virtual_pointer_t rhs) const {
return (static_cast<base_ptr_t>(m_contents) ==
static_cast<base_ptr_t>(rhs.m_contents));
}
/**
* Simple forward to the equality overload.
*/
bool operator!=(virtual_pointer_t rhs) const {
return !(this->operator==(rhs));
}
/**
* Converts a void * into a virtual pointer structure.
* Note that this will only work if the void * was
* already a virtual_pointer_t, but we have no way of
* checking
*/
virtual_pointer_t(const void *ptr)
: m_contents(reinterpret_cast<base_ptr_t>(ptr)){};
/**
* Creates a virtual_pointer_t from the given integer
* number
*/
virtual_pointer_t(base_ptr_t u) : m_contents(u){};
};
/* Definition of a null pointer
*/
const virtual_pointer_t null_virtual_ptr = nullptr;
/**
* Returns whether a pointer is null.
* A pointer is null if its value equals null_virtual_ptr
*/
static inline bool is_nullptr(virtual_pointer_t ptr) {
return (static_cast<void *>(ptr) == nullptr);
}
/* basic type for all buffers
*/
using buffer_t = cl::sycl::buffer<buffer_data_type_t>;
/**
* Node that stores information about a device allocation.
* Nodes are sorted by size to organise a free list of nodes
* that can be recovered.
*/
struct pMapNode_t {
buffer_t m_buffer;
size_t m_size;
bool m_free;
pMapNode_t(buffer_t b, size_t size, bool f)
: m_buffer{b}, m_size{size}, m_free{f} {
m_buffer.set_final_data(nullptr);
}
bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); }
};
/** Storage of the pointer / buffer tree
*/
using pointerMap_t = std::map<virtual_pointer_t, pMapNode_t>;
/**
* Obtain the insertion point in the pointer map for
* a pointer of the given size.
* \param requiredSize Size attempted to reclaim
*/
typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) {
typename pointerMap_t::iterator retVal;
bool reuse = false;
if (!m_freeList.empty()) {
// try to re-use an existing block
for (auto freeElem : m_freeList) {
if (freeElem->second.m_size >= requiredSize) {
retVal = freeElem;
reuse = true;
// Element is not going to be free anymore
m_freeList.erase(freeElem);
break;
}
}
}
if (!reuse) {
retVal = std::prev(m_pointerMap.end());
}
return retVal;
}
/**
* Returns an iterator to the node that stores the information
* of the given virtual pointer from the given pointer map structure.
* If pointer is not found, throws std::out_of_range.
* If the pointer map structure is empty, throws std::out_of_range
*
* \param pMap the pointerMap_t structure storing all the pointers
* \param virtual_pointer_ptr The virtual pointer to obtain the node of
* \throws std::out_of_range if the pointer is not found or pMap is empty
*/
typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) {
if (this->count() == 0) {
m_pointerMap.clear();
EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n"));
}
if (is_nullptr(ptr)) {
m_pointerMap.clear();
EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n"));
}
// The previous element to the lower bound is the node that
// holds this memory address
auto node = m_pointerMap.lower_bound(ptr);
// If the value of the pointer is not the one of the node
// then we return the previous one
if (node == std::end(m_pointerMap)) {
--node;
} else if (node->first != ptr) {
if (node == std::begin(m_pointerMap)) {
m_pointerMap.clear();
EIGEN_THROW_X(
std::out_of_range("The pointer is not registered in the map\n"));
}
--node;
}
return node;
}
/* get_buffer.
* Returns a buffer from the map using the pointer address
*/
template <typename buffer_data_type = buffer_data_type_t>
cl::sycl::buffer<buffer_data_type, 1> get_buffer(
const virtual_pointer_t ptr) {
auto node = get_node(ptr);
auto& map_node = node->second;
eigen_assert(node->first == ptr || node->first < ptr);
eigen_assert(ptr < static_cast<virtual_pointer_t>(map_node.m_size +
node->first));
return map_node.m_buffer.reinterpret<buffer_data_type>(
cl::sycl::range<1>{map_node.m_size / sizeof(buffer_data_type)});
}
/**
* @brief Returns an accessor to the buffer of the given virtual pointer
* @param accessMode
* @param accessTarget
* @param ptr The virtual pointer
*/
template <sycl_acc_mode access_mode = default_acc_mode,
sycl_acc_target access_target = default_acc_target,
typename buffer_data_type = buffer_data_type_t>
cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
get_access(const virtual_pointer_t ptr) {
auto buf = get_buffer<buffer_data_type>(ptr);
return buf.template get_access<access_mode, access_target>();
}
/**
* @brief Returns an accessor to the buffer of the given virtual pointer
* in the given command group scope
* @param accessMode
* @param accessTarget
* @param ptr The virtual pointer
* @param cgh Reference to the command group scope
*/
template <sycl_acc_mode access_mode = default_acc_mode,
sycl_acc_target access_target = default_acc_target,
typename buffer_data_type = buffer_data_type_t>
cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) {
auto buf = get_buffer<buffer_data_type>(ptr);
return buf.template get_access<access_mode, access_target>(cgh);
}
/*
* Returns the offset from the base address of this pointer.
*/
inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) {
// The previous element to the lower bound is the node that
// holds this memory address
auto node = get_node(ptr);
auto start = node->first;
eigen_assert(start == ptr || start < ptr);
eigen_assert(ptr < start + node->second.m_size);
return (ptr - start);
}
/*
* Returns the number of elements by which the given pointer is offset from
* the base address.
*/
template <typename buffer_data_type>
inline size_t get_element_offset(const virtual_pointer_t ptr) {
return get_offset(ptr) / sizeof(buffer_data_type);
}
/**
* Constructs the PointerMapper structure.
*/
PointerMapper(base_ptr_t baseAddress = 4096)
: m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} {
if (m_baseAddress == 0) {
EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n"));
}
};
/**
* PointerMapper cannot be copied or moved
*/
PointerMapper(const PointerMapper &) = delete;
/**
* Empty the pointer list
*/
inline void clear() {
m_freeList.clear();
m_pointerMap.clear();
}
/* add_pointer.
* Adds an existing pointer to the map and returns the virtual pointer id.
*/
inline virtual_pointer_t add_pointer(const buffer_t &b) {
return add_pointer_impl(b);
}
/* add_pointer.
* Adds a pointer to the map and returns the virtual pointer id.
*/
inline virtual_pointer_t add_pointer(buffer_t &&b) {
return add_pointer_impl(b);
}
/**
* @brief Fuses the given node with the previous nodes in the
* pointer map if they are free
*
* @param node A reference to the free node to be fused
*/
void fuse_forward(typename pointerMap_t::iterator &node) {
while (node != std::prev(m_pointerMap.end())) {
// if following node is free
// remove it and extend the current node with its size
auto fwd_node = std::next(node);
if (!fwd_node->second.m_free) {
break;
}
auto fwd_size = fwd_node->second.m_size;
m_freeList.erase(fwd_node);
m_pointerMap.erase(fwd_node);
node->second.m_size += fwd_size;
}
}
/**
* @brief Fuses the given node with the following nodes in the
* pointer map if they are free
*
* @param node A reference to the free node to be fused
*/
void fuse_backward(typename pointerMap_t::iterator &node) {
while (node != m_pointerMap.begin()) {
// if previous node is free, extend it
// with the size of the current one
auto prev_node = std::prev(node);
if (!prev_node->second.m_free) {
break;
}
prev_node->second.m_size += node->second.m_size;
// remove the current node
m_freeList.erase(node);
m_pointerMap.erase(node);
// point to the previous node
node = prev_node;
}
}
/* remove_pointer.
* Removes the given pointer from the map.
* The pointer is allowed to be reused only if ReUse is true.
*/
template <bool ReUse = true>
void remove_pointer(const virtual_pointer_t ptr) {
if (is_nullptr(ptr)) {
return;
}
auto node = this->get_node(ptr);
node->second.m_free = true;
m_freeList.emplace(node);
// Fuse the node
// with free nodes before and after it
fuse_forward(node);
fuse_backward(node);
// If after fusing the node is the last one
// simply remove it (since it is free)
if (node == std::prev(m_pointerMap.end())) {
m_freeList.erase(node);
m_pointerMap.erase(node);
}
}
/* count.
* Return the number of active pointers (i.e., pointers that
* have been allocated but not freed).
*/
size_t count() const { return (m_pointerMap.size() - m_freeList.size()); }
private:
/* add_pointer_impl.
* Adds a pointer to the map and returns the virtual pointer id.
* BufferT is either a const buffer_t& or a buffer_t&&.
*/
template <class BufferT>
virtual_pointer_t add_pointer_impl(BufferT b) {
virtual_pointer_t retVal = nullptr;
size_t bufSize = b.get_count() * sizeof(buffer_data_type_t);
auto byte_buffer =
b.template reinterpret<buffer_data_type_t>(cl::sycl::range<1>{bufSize});
pMapNode_t p{byte_buffer, bufSize, false};
// If this is the first pointer:
if (m_pointerMap.empty()) {
virtual_pointer_t initialVal{m_baseAddress};
m_pointerMap.emplace(initialVal, p);
return initialVal;
}
auto lastElemIter = get_insertion_point(bufSize);
// We are recovering an existing free node
if (lastElemIter->second.m_free) {
lastElemIter->second.m_buffer = b;
lastElemIter->second.m_free = false;
// If the recovered node is bigger than the inserted one
// add a new free node with the remaining space
if (lastElemIter->second.m_size > bufSize) {
// create a new node with the remaining space
auto remainingSize = lastElemIter->second.m_size - bufSize;
pMapNode_t p2{b, remainingSize, true};
// update size of the current node
lastElemIter->second.m_size = bufSize;
// add the new free node
auto newFreePtr = lastElemIter->first + bufSize;
auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first;
m_freeList.emplace(freeNode);
}
retVal = lastElemIter->first;
} else {
size_t lastSize = lastElemIter->second.m_size;
retVal = lastElemIter->first + lastSize;
m_pointerMap.emplace(retVal, p);
}
return retVal;
}
/**
* Compare two iterators to pointer map entries according to
* the size of the allocation on the device.
*/
struct SortBySize {
bool operator()(typename pointerMap_t::iterator a,
typename pointerMap_t::iterator b) const {
return ((a->first < b->first) && (a->second <= b->second)) ||
((a->first < b->first) && (b->second <= a->second));
}
};
/* Maps the pointer addresses to buffer and size pairs.
*/
pointerMap_t m_pointerMap;
/* List of free nodes available for re-using
*/
std::set<typename pointerMap_t::iterator, SortBySize> m_freeList;
/* Base address used when issuing the first virtual pointer, allows users
* to specify alignment. Cannot be zero. */
std::intptr_t m_baseAddress;
};
/* remove_pointer.
* Removes the given pointer from the map.
* The pointer is allowed to be reused only if ReUse is true.
*/
template <>
inline void PointerMapper::remove_pointer<false>(const virtual_pointer_t ptr) {
if (is_nullptr(ptr)) {
return;
}
m_pointerMap.erase(this->get_node(ptr));
}
/**
* Malloc-like interface to the pointer-mapper.
* Given a size, creates a byte-typed buffer and returns a
* fake pointer to keep track of it.
* \param size Size in bytes of the desired allocation
* \throw cl::sycl::exception if error while creating the buffer
*/
inline void *SYCLmalloc(size_t size, PointerMapper &pMap) {
if (size == 0) {
return nullptr;
}
// Create a generic buffer of the given size
using buffer_t = cl::sycl::buffer<buffer_data_type_t, 1>;
auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size}));
// Store the buffer on the global list
return static_cast<void *>(thePointer);
}
/**
* Free-like interface to the pointer mapper.
* Given a fake-pointer created with the virtual-pointer malloc,
* destroys the buffer and remove it from the list.
* If ReUse is false, the pointer is not added to the freeList;
* it should be false only for sub-buffers.
*/
template <bool ReUse = true, typename PointerMapper>
inline void SYCLfree(void *ptr, PointerMapper &pMap) {
pMap.template remove_pointer<ReUse>(ptr);
}
/**
* Clear all the memory allocated by SYCL.
*/
template <typename PointerMapper>
inline void SYCLfreeAll(PointerMapper &pMap) {
pMap.clear();
}
template <cl::sycl::access::mode AcMd, typename T>
struct RangeAccess {
static const auto global_access = cl::sycl::access::target::global_buffer;
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
typedef T scalar_t;
typedef scalar_t &ref_t;
typedef scalar_t *ptr_t;
// the accessor type is not necessarily the same as T
typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
accessor;
typedef RangeAccess<AcMd, T> self_t;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access,
size_t offset,
std::intptr_t virtual_ptr)
: access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {}
RangeAccess(cl::sycl::buffer<scalar_t, 1> buff =
cl::sycl::buffer<scalar_t, 1>(cl::sycl::range<1>(1)))
: access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {}
// This should be only used for null constructor on the host side
RangeAccess(std::nullptr_t) : RangeAccess() {}
// This template parameter must be removed and scalar_t should be replaced
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
typedef cl::sycl::multi_ptr<scalar_t,
cl::sycl::access::address_space::generic_space,
cl::sycl::access::decorated::no>
multi_ptr;
multi_ptr p(access_);
return (p + offset_).get_raw();
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
offset_ += (offset);
return *this;
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const {
return self_t(access_, offset_ + offset, virtual_ptr_);
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const {
return self_t(access_, offset_ - offset, virtual_ptr_);
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) {
offset_ -= offset;
return *this;
}
// THIS IS FOR NULL COMPARISON ONLY
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
const RangeAccess &lhs, std::nullptr_t) {
return ((lhs.virtual_ptr_ == -1));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
const RangeAccess &lhs, std::nullptr_t i) {
return !(lhs == i);
}
// THIS IS FOR NULL COMPARISON ONLY
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
std::nullptr_t, const RangeAccess &rhs) {
return ((rhs.virtual_ptr_ == -1));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
std::nullptr_t i, const RangeAccess &rhs) {
return !(i == rhs);
}
// Prefix operator (Increment and return value)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() {
offset_++;
return (*this);
}
// Postfix operator (Return value and increment)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) {
EIGEN_UNUSED_VARIABLE(i);
self_t temp_iterator(*this);
offset_++;
return temp_iterator;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const {
return (access_.get_count() - offset_);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const {
return offset_;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) {
offset_ = offset;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const {
return *get_pointer();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() {
return *get_pointer();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) {
return *(get_pointer() + x);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const {
return *(get_pointer() + x);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const {
return reinterpret_cast<scalar_t *>(virtual_ptr_ +
(offset_ * sizeof(scalar_t)));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const {
return (virtual_ptr_ != -1);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess<AcMd, const T>() {
return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
operator RangeAccess<AcMd, const T>() const {
return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
}
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
cl::sycl::handler &cgh) const {
cgh.require(access_);
}
private:
accessor access_;
size_t offset_;
std::intptr_t virtual_ptr_; // the location of the buffer in the map
};
template <cl::sycl::access::mode AcMd, typename T>
struct RangeAccess<AcMd, const T> : RangeAccess<AcMd, T> {
typedef RangeAccess<AcMd, T> Base;
using Base::Base;
};
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
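
For reference, the file removed above implemented a malloc/free-style interface on top of SYCL buffers: SYCLmalloc created a byte buffer and returned a fake pointer (an integer key starting at the PointerMapper base address), get_offset recovered the byte offset inside the owning node, and SYCLfree marked the node free and coalesced it with free neighbours for reuse. A rough usage sketch of that removed interface, assuming a pre-USM Eigen checkout where this header still exists and a SYCL compiler with EIGEN_USE_SYCL defined:

#define EIGEN_USE_SYCL
#include <Eigen/Core>  // pulled in SyclMemoryModel.h before this commit
#include <cstddef>

int main() {
  using namespace Eigen::TensorSycl::internal;
  PointerMapper pmap;                       // fake-pointer -> buffer map
  void* a = SYCLmalloc(256, pmap);          // first fake pointer (base address)
  void* b = SYCLmalloc(512, pmap);          // next fake pointer, 256 bytes further
  std::ptrdiff_t off = pmap.get_offset(static_cast<char*>(b) + 16);  // 16
  SYCLfree(a, pmap);                        // node goes to the free list for reuse
  SYCLfreeAll(pmap);                        // drop all remaining buffers
  return off == 16 ? 0 : 1;
}
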

View File

@ -531,12 +531,12 @@
#error "NVCC as the target platform for HIPCC is currently not supported."
#endif
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is either nvcc or clang with CUDA enabled
#define EIGEN_CUDACC __CUDACC__
#endif
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__)
// Means we are generating code for the device
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
@ -548,7 +548,7 @@
#define EIGEN_CUDA_SDK_VER 0
#endif
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
#define EIGEN_HIPCC __HIPCC__
@ -557,7 +557,7 @@
// ++ host_defines.h which contains the defines for the __host__ and __device__ macros
#include <hip/hip_runtime.h>
#if defined(__HIP_DEVICE_COMPILE__)
#if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
#endif
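
The guards added above keep Eigen's CUDA/HIP device paths disabled while a SYCL compiler runs its device pass, even when that compiler targets a CUDA or HIP backend and therefore also defines __CUDACC__/__CUDA_ARCH__ (or the HIP equivalents). A small illustration of the resulting dispatch, using a hypothetical macro rather than the Eigen ones:

// Hypothetical consumer of the same guard pattern.
#if defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
  // Plain CUDA device compilation: CUDA-specific code may be emitted.
  #define MY_BACKEND 1  // CUDA path
#elif defined(__SYCL_DEVICE_ONLY__)
  // SYCL device pass: even if __CUDA_ARCH__ is defined by a CUDA-backend
  // SYCL compiler, the SYCL path wins because of the extra guard.
  #define MY_BACKEND 2  // SYCL path
#else
  #define MY_BACKEND 0  // host path
#endif
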

View File

@ -2,17 +2,13 @@
// A Scalar with internal representation T+1 so that zero is internally
// represented by T(1). This is used to test memory fill.
//
#pragma once
template<typename T>
class OffByOneScalar {
public:
OffByOneScalar() : val_(1) {}
OffByOneScalar(const OffByOneScalar& other) {
*this = other;
}
OffByOneScalar& operator=(const OffByOneScalar& other) {
val_ = other.val_;
return *this;
}
OffByOneScalar(const OffByOneScalar& other) = default;
OffByOneScalar& operator=(const OffByOneScalar& other) = default;
OffByOneScalar(T val) : val_(val + 1) {}
OffByOneScalar& operator=(T val) {

View File

@ -128,12 +128,6 @@ struct TensorEvaluator<const TensorIndexPairOp<ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
};
@ -278,12 +272,6 @@ struct TensorEvaluator<const TensorPairReducerOp<ReduceOp, Dims, ArgType>, Devic
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_orig_impl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {

View File

@ -228,14 +228,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
block.cleanup();
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
private:

View File

@ -687,13 +687,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
Broadcast functor() const { return m_broadcast; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
cl::sycl::handler& cgh) const {
m_impl.bind(cgh);
}
#endif
private:
static constexpr bool IsColMajor =
static_cast<int>(Layout) == static_cast<int>(ColMajor);

View File

@ -355,12 +355,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
return NULL;
}
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const

View File

@ -279,14 +279,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;

View File

@ -61,13 +61,6 @@ struct CoeffLoader {
return m_tensor.template packet<LoadMode>(index);
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors need to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_tensor.bind(cgh);
}
#endif
private:
const Tensor m_tensor;
};
@ -97,12 +90,6 @@ struct CoeffLoader<Tensor, true, MakePointer_> {
return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors need to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
private:
typedef typename Tensor::Scalar Scalar;
@ -250,13 +237,6 @@ class SimpleTensorContractionMapper {
return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors need to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_tensor.bind(cgh);
}
#endif
const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
return m_tensor;
}
@ -508,13 +488,6 @@ class TensorContractionSubMapper {
return false;
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors need to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_base_mapper.bind(cgh);
}
#endif
const ParentMapper& base_mapper() const { return m_base_mapper; }
Index vert_offset() const { return m_vert_offset; }
Index horiz_offset() const { return m_horiz_offset; }
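
The bind() removals in this and the surrounding evaluator/mapper files all follow from the same USM change: the evaluators now hold raw device pointers rather than placeholder accessors, so there is nothing to register with the command group before launching a kernel. A minimal sketch of the difference (illustrative kernel, not an Eigen evaluator; assumes a SYCL 2020 queue and a USM device pointer):

#include <CL/sycl.hpp>

void scale(cl::sycl::queue& q, float* data, size_t n, float factor) {
  q.submit([&](cl::sycl::handler& cgh) {
    // Previously a placeholder accessor had to be bound here via
    // evaluator.bind(cgh) / cgh.require(acc). With USM the raw pointer is
    // simply captured by the kernel lambda.
    cgh.parallel_for(cl::sycl::range<1>(n),
                     [=](cl::sycl::id<1> i) { data[i] *= factor; });
  }).wait();
}
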

View File

@ -627,7 +627,7 @@ class TensorContractionKernel {
ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
(thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
: compute_panel<false>(itemID, thread_properties, out_ptr);
@ -1048,7 +1048,7 @@ struct GeneralVectorTensor {
is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
const StorageIndex contractGroupId =
is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
auto out_ptr = out_res + (IsFinal ? 0 : contractGroupId * nonContractDim);
const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
@ -1255,8 +1255,8 @@ struct GeneralScalarContraction {
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_res.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
auto out_ptr = out_res;
OutScalar * scratch_ptr = scratch.get_pointer();
StorageIndex globalid = itemID.get_global_id(0);
StorageIndex localid = itemID.get_local_id(0);
@ -1395,13 +1395,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
typedef internal::TensorContractionInputMapper<
LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer>
LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
right_nocontract_t, contract_t,
PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
rhs_inner_dim_reordered, Unaligned, MakePointer>
RhsMapper;
// initialize data mappers
@ -1505,8 +1505,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
PacketAccess, input_mapper_properties, true, ct>
ContractKernelName;
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim).wait();
} else {
typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
@ -1518,7 +1518,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
triple_dim);
triple_dim).wait();
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
auto op = Op();
@ -1531,8 +1531,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
cl::sycl::range<1>(localRange)),
StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK).wait();
device().deallocate_temp(temp_pointer);
}
}
@ -1566,28 +1565,28 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait();
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
EvaluatorPointerType, Op>
ReductionKernel;
device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
tmp_global_accessor, buffer,
cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
cl::sycl::range<1>(localRange)),
StorageIndex(1), Op(), nonContractDim, cNumGroups);
StorageIndex(1), Op(), nonContractDim, cNumGroups).wait();
device().deallocate_temp(temp_pointer);
} else {
typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
TensorMapper, StorageIndex, Properties, CFactor, false,
is_lhs_vec, true>
ContractKernelName;
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait();
}
}
#endif
@ -1616,19 +1615,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
thread_range, local_range, K);
thread_range, local_range, K).wait();
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
EvaluatorPointerType, StorageIndex, local_range>
GenericRKernel;
device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
tmp_global_accessor, buffer,
cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()).wait();
device().deallocate_temp(temp_pointer);
} else {
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
local_range, K);
local_range, K).wait();
}
}
#endif
@ -1642,12 +1640,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
this->m_result = NULL;
}
}
// The placeholder accessors must be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
this->m_leftImpl.bind(cgh);
this->m_rightImpl.bind(cgh);
this->m_result.bind(cgh);
}
};
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
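
Two recurring changes in the contraction hunks above: the output handle is now a plain pointer, so expressions like out_res + offset are ordinary pointer arithmetic (no get_pointer() call), and every kernel_launcher call is followed by .wait(), since the launchers now return the cl::sycl::event instead of synchronizing internally. A reduced sketch of the launch/wait/deallocate pattern, assuming a USM queue (names are illustrative):

#include <CL/sycl.hpp>

void copy_then_free(cl::sycl::queue& q, float* tmp, float* out, size_t n) {
  cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
    cgh.parallel_for(cl::sycl::range<1>(n),
                     [=](cl::sycl::id<1> i) { out[i] = tmp[i]; });
  });
  e.wait();                // mirrors the .wait() calls added above
  cl::sycl::free(tmp, q);  // analogous to device().deallocate_temp(temp_pointer)
}
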

View File

@ -442,12 +442,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;

View File

@ -61,8 +61,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
return (boolean_check[0] && boolean_check[1]);
}
void operator()(cl::sycl::nd_item<2> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
auto buffer_ptr = buffer_acc;
auto kernel_ptr = kernel_filter;
// the required row to be calculated for each plane in shared memory
const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
@ -128,8 +128,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
}
void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
auto buffer_ptr = buffer_acc;
auto kernel_ptr = kernel_filter;
// the required row to be calculated for each plane in shared memory
const auto num_input = cl::sycl::range<2>{
(cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
@ -216,8 +216,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
auto buffer_ptr = buffer_acc;
auto kernel_ptr = kernel_filter;
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
@ -411,7 +411,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])).wait();
break;
}
@ -442,7 +442,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}).wait();
break;
}
@ -482,7 +482,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP).wait();
break;
}
@ -519,13 +519,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
}
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_kernelImpl.bind(cgh);
m_inputImpl.bind(cgh);
m_buf.bind(cgh);
m_kernel.bind(cgh);
}
private:
// No assignment (copies are needed by the kernels)

View File

@ -151,13 +151,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_result.bind(cgh);
}
#endif
protected:
void evalTo(EvaluatorPointerType data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
@ -324,12 +317,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_result.bind(cgh);
}
#endif
protected:
void evalTo(EvaluatorPointerType data) {

View File

@ -31,7 +31,8 @@ struct SyclDeviceInfo {
.template get_info<cl::sycl::info::device::local_mem_type>()),
max_work_item_sizes(
queue.get_device()
.template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
.template get_info<
cl::sycl::info::device::max_work_item_sizes<3>>()),
max_mem_alloc_size(
queue.get_device()
.template get_info<
@ -69,7 +70,6 @@ struct SyclDeviceInfo {
} // end namespace internal
} // end namespace TensorSycl
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
// TensorFlow via the Eigen SYCL Backend.
@ -110,147 +110,52 @@ class QueueInterface {
explicit QueueInterface(
const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
unsigned num_threads = std::thread::hardware_concurrency())
: m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
: m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}},
m_thread_pool(num_threads),
m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog.build_with_kernel_type<DeviceOrSelector>();
auto f = [&](cl::sycl::handler &cgh) {
cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
[=]() {})
};
EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
}
m_device_info(m_queue) {}
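
The constructor above now builds the queue with the in_order property, which serializes submissions on the queue and removes the need for the buffer/accessor dependency tracking (and the optional program-class path) that the deleted code provided. A standalone sketch of the same construction, assuming a SYCL 2020 implementation where cl::sycl aliases sycl (the async handler body is illustrative):

#include <CL/sycl.hpp>
#include <exception>
#include <iostream>

int main() {
  auto async_handler = [](cl::sycl::exception_list exceptions) {
    for (std::exception_ptr e : exceptions) {
      try { std::rethrow_exception(e); }
      catch (const cl::sycl::exception& ex) { std::cerr << ex.what() << "\n"; }
    }
  };
  cl::sycl::queue q{cl::sycl::default_selector_v, async_handler,
                    cl::sycl::property_list{cl::sycl::property::queue::in_order()}};
  std::cout << q.get_device().get_info<cl::sycl::info::device::name>() << "\n";
  return 0;
}
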
template <typename DeviceOrSelector>
explicit QueueInterface(
const DeviceOrSelector &dev_or_sel,
unsigned num_threads = std::thread::hardware_concurrency())
: QueueInterface(dev_or_sel,
[this](cl::sycl::exception_list l) {
this->exception_caught_ = this->sycl_async_handler(l);
},
num_threads) {}
: QueueInterface(
dev_or_sel,
[this](cl::sycl::exception_list l) {
this->exception_caught_ = this->sycl_async_handler(l);
},
num_threads) {}
explicit QueueInterface(
const cl::sycl::queue& q, unsigned num_threads = std::thread::hardware_concurrency())
: m_queue(q),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
m_thread_pool(num_threads),
m_device_info(m_queue) {}
const cl::sycl::queue &q,
unsigned num_threads = std::thread::hardware_concurrency())
: m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {}
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif
/// Attach an existing buffer to the pointer map, Eigen will not reuse it
EIGEN_STRONG_INLINE void *attach_buffer(
cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return static_cast<void *>(pMapper.add_pointer(buf));
}
/// Detach previously attached buffer
EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfree<false>(p, pMapper);
}
/// Allocating device pointer. This pointer is actually an 8 bytes host
/// pointer used as key to access the sycl device buffer. The reason is that
/// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
/// expressions. So we create a key pointer to be used in Eigen expression
/// construction. When we convert the Eigen construction into the sycl
/// construction we use this pointer as a key in our buffer_map and we make
/// sure that we dedicate only one buffer only for this pointer. The device
/// pointer would be deleted by calling deallocate function.
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
if (align > 0) {
num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
}
return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES,
num_bytes, m_queue);
#else
return (void *)cl::sycl::malloc_device(num_bytes, m_queue);
#endif
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
}
EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
if (align > 0) {
num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
}
#endif
std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
if (scratch_buffers.empty()) {
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
;
} else {
for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
auto buff = pMapper.get_buffer(*it);
if (buff.get_size() >= num_bytes) {
auto ptr = *it;
scratch_buffers.erase(it);
return ptr;
} else {
++it;
}
}
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
}
#else
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, data_t>
get(data_t *data) const {
return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
data_t>
data) const {
return static_cast<data_t *>(data.get_virtual_pointer());
return (void *)cl::sycl::malloc_device<uint8_t>(num_bytes, m_queue);
}
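
allocate() and allocate_temp() above now go straight to USM device allocations, keeping only the EIGEN_MAX_ALIGN_BYTES size rounding from the old path. A standalone sketch of that logic, with a hard-coded 64-byte alignment standing in for EIGEN_MAX_ALIGN_BYTES:

#include <CL/sycl.hpp>
#include <cstddef>

void* device_allocate(cl::sycl::queue& q, std::size_t num_bytes) {
  constexpr std::size_t kAlign = 64;       // stand-in for EIGEN_MAX_ALIGN_BYTES
  const std::size_t rem = num_bytes % kAlign;
  if (rem > 0) num_bytes += kAlign - rem;  // round up to the alignment
  return cl::sycl::aligned_alloc_device(kAlign, num_bytes, q);
}

void device_deallocate(cl::sycl::queue& q, void* p) {
  cl::sycl::free(p, q);  // matches the new deallocate()/deallocate_temp()
}
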
EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.insert(p);
#else
TensorSycl::internal::SYCLfree(p, pMapper);
#endif
}
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE void deallocate_temp(
const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
deallocate_temp(p.get_virtual_pointer());
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
return data;
}
EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); }
EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const {
deallocate_temp(const_cast<void *>(p));
}
/// This is used to deallocate the device pointer. p is used as a key inside
/// the map to find the device buffer and delete it.
EIGEN_STRONG_INLINE void deallocate(void *p) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfree(p, pMapper);
}
EIGEN_STRONG_INLINE void deallocate_all() const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.clear();
#endif
cl::sycl::free(p, m_queue);
}
/// The memcpyHostToDevice is used to copy the data from host to device
@ -260,24 +165,7 @@ class QueueInterface {
EIGEN_STRONG_INLINE void memcpyHostToDevice(
void *dst, const void *src, size_t n,
std::function<void()> callback) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
static const auto global_access = cl::sycl::access::target::global_buffer;
typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
write_accessor;
if (n == 0) {
if (callback) callback();
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
auto non_deleter = [](buffer_scalar_t const *) {};
std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
cgh.copy(s_ptr, dst_acc);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
auto e = m_queue.memcpy(dst, src, n);
synchronize_and_callback(e, callback);
}
@ -288,24 +176,11 @@ class QueueInterface {
EIGEN_STRONG_INLINE void memcpyDeviceToHost(
void *dst, const void *src, size_t n,
std::function<void()> callback) const {
static const auto read_mode = cl::sycl::access::mode::read;
static const auto global_access = cl::sycl::access::target::global_buffer;
typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
read_accessor;
if (n == 0) {
if (callback) callback();
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
auto non_deleter = [](buffer_scalar_t *) {};
std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
cgh.copy(src_acc, s_ptr);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
auto e = m_queue.memcpy(dst, src, n);
synchronize_and_callback(e, callback);
}
@ -313,257 +188,88 @@ class QueueInterface {
/// No callback is required here as both arguments are on the device
/// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
static const auto read_mode = cl::sycl::access::mode::read;
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (n == 0) {
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
cgh.copy(src_acc, dst_acc);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
m_queue.memcpy(dst, src, n).wait();
}
/// the memset function.
/// No callback is required here as both arguments are on the device
/// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (n == 0) {
return;
}
auto f = [&](cl::sycl::handler &cgh) {
// Get a typed range accessor to ensure we fill each byte, in case
// `buffer_scalar_t` is not (u)int8_t.
auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
cgh.fill(dst_acc, static_cast<uint8_t>(c));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
m_queue.memset(data, c, n).wait();
}
template<typename T>
EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
template <typename T>
EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
if (begin == end) {
return;
}
const ptrdiff_t count = end - begin;
auto f = [&](cl::sycl::handler &cgh) {
auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
cgh.fill(dst_acc, value);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
}
/// Get a range accessor to the virtual pointer's device memory. This range
/// accessor will allow access to the memory from the pointer to the end of
/// the buffer.
///
/// NOTE: Inside a kernel the range accessor will always be indexed from the
/// start of the buffer, so the offset in the accessor is only used by
/// methods like handler::copy and will not be available inside a kernel.
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
get_range_accessor(const void *ptr) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto original_buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
const ptrdiff_t typed_offset = offset / sizeof(T);
eigen_assert(typed_offset >= 0);
const auto typed_size = original_buffer.get_size() / sizeof(T);
auto buffer = original_buffer.template reinterpret<
std::remove_const_t<T>>(
cl::sycl::range<1>(typed_size));
const ptrdiff_t size = buffer.get_count() - typed_offset;
eigen_assert(size >= 0);
typedef cl::sycl::accessor<std::remove_const_t<T>,
1, AcMd, global_access, is_place_holder>
placeholder_accessor_t;
const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
cl::sycl::id<1>(typed_offset)),
static_cast<size_t>(typed_offset),
reinterpret_cast<std::intptr_t>(start_ptr));
}
/// Get a range accessor to the virtual pointer's device memory with a
/// specified size.
template <cl::sycl::access::mode AcMd, typename Index>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
const Index n_bytes) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
eigen_assert(n_bytes >= 0);
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset >= 0);
eigen_assert(offset + n_bytes <= buffer.get_size());
return buffer.template get_access<AcMd, global_access>(
cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
}
/// Get a range accessor to the virtual pointer's device memory with a
/// specified type and count.
template <cl::sycl::access::mode AcMd, typename T, typename Index>
EIGEN_STRONG_INLINE cl::sycl::accessor<
T, 1, AcMd, cl::sycl::access::target::global_buffer>
get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
const Index count) const {
static const auto global_access = cl::sycl::access::target::global_buffer;
eigen_assert(count >= 0);
std::lock_guard<std::mutex> lock(pmapper_mutex_);
auto buffer = pMapper.get_buffer(ptr);
const ptrdiff_t offset = pMapper.get_offset(ptr);
eigen_assert(offset >= 0);
// Technically we should create a subbuffer for the desired range,
// then reinterpret that. However, I was not able to get changes to reflect
// in the original buffer (only the subbuffer and reinterpreted buffer).
// This current implementation now has the restriction that the buffer
// offset and original buffer size must be a multiple of sizeof(T).
// Note that get_range_accessor(void*) currently has the same restriction.
//
// auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
// cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
const ptrdiff_t typed_offset = offset / sizeof(T);
const size_t typed_size = buffer.get_size() / sizeof(T);
auto reint = buffer.template reinterpret<
std::remove_const_t<T>>(
cl::sycl::range<1>(typed_size));
return reint.template get_access<AcMd, global_access>(
cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
}
/// Creation of a SYCL accessor for a buffer. This function first tries to
/// find the buffer in the buffer_map. If found, it gets the accessor from it;
/// if not, it adds an entry by creating a SYCL buffer for that particular
/// pointer.
template <cl::sycl::access::mode AcMd>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_buffer(ptr)
.template get_access<AcMd, cl::sycl::access::target::global_buffer>(
cgh);
}
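// A minimal usage sketch (illustrative only; `queue_interface` and `dev_ptr`
// are assumptions): the buffer backing `dev_ptr` is looked up (or created)
// under the pmapper mutex and a whole-buffer accessor is bound to `cgh`.
//
//   queue_interface.sycl_queue().submit([&](cl::sycl::handler &cgh) {
//     auto acc = queue_interface.template get_sycl_accessor<
//         cl::sycl::access::mode::read_write>(cgh, dev_ptr);
//     cgh.single_task([=]() { acc[0] = buffer_scalar_t(0); });
//   });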
EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_buffer(ptr);
}
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
std::lock_guard<std::mutex> lock(pmapper_mutex_);
return pMapper.get_offset(ptr);
const size_t count = end - begin;
m_queue.fill(begin, value, count).wait();
}
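// An illustrative sketch of the USM fill path used above (names such as `q`
// are assumptions, not part of this file): `begin` is a device pointer from
// cl::sycl::malloc_device, and queue::fill writes `count` copies of `value`.
//
//   float *begin = cl::sycl::malloc_device<float>(128, q);
//   q.fill(begin, 1.0f, std::size_t(128)).wait();
//   cl::sycl::free(begin, q);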
template <typename OutScalar, typename sycl_kernel, typename Lhs,
typename Rhs, typename OutPtr, typename Range, typename Index,
typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
const Rhs &rhs, OutPtr outptr,
Range thread_range,
Index scratchSize,
T... var) const {
EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(
const Lhs &lhs, const Rhs &rhs, OutPtr outptr, Range thread_range,
Index scratchSize, T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
lhs.bind(cgh);
rhs.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
cgh.parallel_for(thread_range,
sycl_kernel(scratch, lhs, rhs, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
return m_queue.submit(kernel_functor);
}
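// Caller-side sketch (kernel and argument names are illustrative): the
// launcher returns the submission event, so synchronization is the caller's
// choice, e.g.
//
//   dev.template binary_kernel_launcher<OutScalar, MyBinaryKernel>(
//          lhs, rhs, out_ptr, thread_range, scratch_size, extra_args...)
//       .wait();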
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename OutPtr, typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
OutPtr &outptr,
Range thread_range,
Index scratchSize,
T... var) const {
EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr,
OutPtr &outptr,
Range thread_range,
Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
inptr.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, inptr, outptr, var...));
cgh.parallel_for(thread_range,
sycl_kernel(scratch, inptr, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
return m_queue.submit(kernel_functor);
}
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
Range thread_range,
Index scratchSize,
T... var) const {
template <typename OutScalar, typename sycl_kernel, typename InPtr,
typename Range, typename Index, typename... T>
EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(
const InPtr &inptr, Range thread_range, Index scratchSize,
T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a command group handler
inptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, inptr, var...));
cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);
}
return m_queue.submit(kernel_functor);
}
EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
@ -573,14 +279,6 @@ class QueueInterface {
#endif
}
EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
synchronize();
#endif
}
template <typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
Index &rng, Index &GRange) const {
@ -776,48 +474,9 @@ class QueueInterface {
return !exception_caught_;
}
EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
std::lock_guard<std::mutex> lock(event_mutex_);
return latest_events_[std::this_thread::get_id()];
#else
eigen_assert(false);
return cl::sycl::event();
#endif
}
// destructor
~QueueInterface() {
pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
scratch_buffers.clear();
#endif
}
template <typename T>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> get_null_accessor()
const {
eigen_assert(null_buff_simulator.get_size() % sizeof(T) == 0 && "The null buffer size must be a multiple of sizeof(T)");
const ptrdiff_t typed_size = null_buff_simulator.get_size() / sizeof(T);
eigen_assert(typed_size > 0);
auto typed_null_buff =
null_buff_simulator.template reinterpret<T>(cl::sycl::range<1>(typed_size));
return TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>(typed_null_buff);
}
protected:
EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
std::lock_guard<std::mutex> lock(event_mutex_);
latest_events_[std::this_thread::get_id()] = e;
#else
EIGEN_UNUSED_VARIABLE(e);
#endif
}
void synchronize_and_callback(cl::sycl::event e,
const std::function<void()> &callback) const {
set_latest_event(e);
if (callback) {
auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
@ -850,29 +509,8 @@ class QueueInterface {
/// class members:
bool exception_caught_ = false;
mutable std::mutex pmapper_mutex_;
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
mutable std::mutex event_mutex_;
mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif
/// std::map is the container used to make sure that we create only one buffer
/// per pointer. The lifespan of the buffer now depends on the lifespan of
/// SyclDevice. If a non-read-only pointer needs to be accessed on the host,
/// we should manually deallocate it.
mutable TensorSycl::internal::PointerMapper pMapper;
cl::sycl::buffer<uint8_t, 1> null_buff_simulator = cl::sycl::buffer<uint8_t, 1>(cl::sycl::range<1>(128));
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
mutable std::unordered_set<void *> scratch_buffers;
#endif
/// sycl queue
mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
mutable cl::sycl::program m_prog;
#endif
/// The thread pool is used to wait on events and call callbacks
/// asynchronously
mutable Eigen::ThreadPool m_thread_pool;
@ -896,32 +534,6 @@ struct SyclDeviceBase {
struct SyclDevice : public SyclDeviceBase {
explicit SyclDevice(const QueueInterface *queue_stream)
: SyclDeviceBase(queue_stream) {}
template <typename scalar_t>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, scalar_t>
get_null_accessor() const {
return queue_stream()->template get_null_accessor<scalar_t>();
}
// this is the accessor used to construct the evaluator
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
get_range_accessor(const void *ptr) const {
return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
}
// get sycl accessor
template <cl::sycl::access::mode AcMd>
EIGEN_STRONG_INLINE cl::sycl::accessor<
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
}
/// Accessing the created sycl device buffer for the device pointer
EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
const void *ptr) const {
return queue_stream()->get_sycl_buffer(ptr);
}
/// This is used to prepare the number of threads and also the number of
/// threads per block for sycl kernels
@ -966,40 +578,14 @@ struct SyclDevice : public SyclDeviceBase {
EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
queue_stream()->deallocate_temp(buffer);
}
template <cl::sycl::access::mode AcMd, typename T>
EIGEN_STRONG_INLINE void deallocate_temp(
const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const {
queue_stream()->deallocate_temp(buffer);
}
EIGEN_STRONG_INLINE void deallocate_all() const {
queue_stream()->deallocate_all();
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, data_t>
get(data_t *data) const {
return queue_stream()->get(data);
}
template <typename data_t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
data_t>
data) const {
return queue_stream()->get(data);
}
/// attach existing buffer
EIGEN_STRONG_INLINE void *attach_buffer(
cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
return queue_stream()->attach_buffer(buf);
}
/// detach buffer
EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
queue_stream()->detach_buffer(p);
}
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
return queue_stream()->get_offset(ptr);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
return data;
}
// some runtime conditions that can be applied here
@ -1029,19 +615,14 @@ struct SyclDevice : public SyclDeviceBase {
queue_stream()->memset(data, c, n);
}
/// the fill function
template<typename T>
EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
template <typename T>
EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
queue_stream()->fill(begin, end, value);
}
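// Usage sketch (pointer and size names are illustrative, not part of this
// file): device.fill(gpu_data, gpu_data + n, DataType(7));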
/// returning the sycl queue
EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
return queue_stream()->sycl_queue();
}
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
EIGEN_STRONG_INLINE cl::sycl::program &program() const {
return queue_stream()->program();
}
#endif
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
@ -1081,13 +662,6 @@ struct SyclDevice : public SyclDeviceBase {
EIGEN_STRONG_INLINE void synchronize() const {
queue_stream()->synchronize();
}
EIGEN_STRONG_INLINE void async_synchronize(
cl::sycl::event e = cl::sycl::event()) const {
queue_stream()->async_synchronize(e);
}
EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
return queue_stream()->get_latest_event();
}
// This function checks if the runtime recorded an error for the
// underlying stream device.
@ -1109,20 +683,20 @@ struct SyclDevice : public SyclDeviceBase {
return queue_stream()->getDeviceVendor();
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
var...);
EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
return queue_stream()
->template binary_kernel_launcher<OutScalar, KernelType>(var...);
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
var...);
EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
return queue_stream()
->template unary_kernel_launcher<OutScalar, KernelType>(var...);
}
template <typename OutScalar, typename KernelType, typename... T>
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
var...);
EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
return queue_stream()
->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
}
};
} // end namespace Eigen

View File

@ -217,13 +217,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
ArgType expression() const { return m_expression; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_buffer.bind(cgh);
}
#endif
private:

View File

@ -183,12 +183,6 @@ struct TensorEvaluator
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
protected:
EvaluatorPointerType m_data;
Dimensions m_dims;
@ -215,13 +209,7 @@ Eigen::half loadConstant(const Eigen::half* address) {
return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
}
#endif
#ifdef EIGEN_USE_SYCL
// overload of load constant should be implemented here based on range access
template <cl::sycl::access::mode AcMd, typename T>
T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
return *address;
}
#endif
} // namespace internal
// Default evaluator for rvalues
@ -338,12 +326,7 @@ struct TensorEvaluator<const Derived, Device>
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
protected:
EvaluatorPointerType m_data;
Dimensions m_dims;
@ -425,13 +408,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_argImpl.bind(cgh);
}
#endif
private:
const NullaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
@ -538,14 +514,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
m_argImpl.bind(cgh);
}
#endif
private:
const Device EIGEN_DEVICE_REF m_device;
const UnaryOp m_functor;
@ -678,13 +646,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
private:
const Device EIGEN_DEVICE_REF m_device;
const BinaryOp m_functor;
@ -793,15 +754,6 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_arg1Impl.bind(cgh);
m_arg2Impl.bind(cgh);
m_arg3Impl.bind(cgh);
}
#endif
private:
const TernaryOp m_functor;
TensorEvaluator<Arg1Type, Device> m_arg1Impl;

View File

@ -743,7 +743,7 @@ class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
evaluator,
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
cl::sycl::range<1>(tileSize)),
Index(1), range);
Index(1), range).wait();
}
evaluator.cleanup();
}

View File

@ -207,12 +207,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {

View File

@ -219,13 +219,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
EvaluatorPointerType data() const { return m_buffer; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_buffer.bind(cgh);
m_impl.bind(cgh);
}
#endif
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;

View File

@ -107,22 +107,6 @@ struct GpuDevice;
struct SyclDevice;
#ifdef EIGEN_USE_SYCL
template <typename T> struct MakeSYCLPointer {
typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
};
template <typename T>
EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
return data;
}
template <typename T>
struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
template <typename T>
struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
namespace TensorSycl {
namespace internal{
template <typename Evaluator, typename Op> class GenericNondeterministicReducer;

View File

@ -266,11 +266,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void extract_coordinates(Index index, array<Index, NumDims>& coords) const {

View File

@ -507,14 +507,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }

View File

@ -228,13 +228,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;

View File

@ -123,13 +123,6 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
}
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

View File

@ -221,12 +221,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
NewDimensions m_dimensions;
@ -655,12 +649,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
return NULL;
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
@ -1004,12 +992,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
return NULL;
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{

View File

@ -485,13 +485,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
private:
struct BlockIteratorState {
BlockIteratorState()

View File

@ -269,13 +269,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;

View File

@ -541,24 +541,6 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
template<typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;
namespace internal {
namespace reduction {
template <typename CoeffReturnType, typename Device>
EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
typename std::enable_if<Eigen::internal::is_same<Device, Eigen::SyclDevice>::value, const Device>::type& dev) {
return (dev.template get_null_accessor<CoeffReturnType>());
}
template <typename CoeffReturnType, typename Device>
EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
typename std::enable_if<!Eigen::internal::is_same<Device, Eigen::SyclDevice>::value, const Device>::type&) {
return NULL;
}
}// end namespace reduction
} // end namespace internal
// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
@ -623,7 +605,7 @@ static constexpr bool RunningOnGPU = false;
EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
: m_impl(op.expression(), device),
m_reducer(op.reducer()),
m_result(internal::reduction::get_null_value<CoeffReturnType, Device>(device)),
m_result(NULL),
m_device(device) {
EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@ -911,13 +893,6 @@ static constexpr bool RunningOnGPU = false;
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_result.bind(cgh);
}
#endif
private:
template <int, typename, typename> friend struct internal::GenericDimReducer;

View File

@ -96,8 +96,8 @@ struct SecondStepFullReducer {
// algorithm must be changed if the number of reductions per thread in the
// second step is greater than 1. Otherwise, the result will be wrong.
const Index localid = itemID.get_local_id(0);
auto aInPtr = aI.get_pointer() + localid;
auto aOutPtr = outAcc.get_pointer();
auto aInPtr = aI + localid;
auto aOutPtr = outAcc;
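// `aI` and `outAcc` are plain device (USM) pointers here, so they are
// indexed with ordinary pointer arithmetic.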
CoeffReturnType *scratchptr = scratch.get_pointer();
CoeffReturnType accumulator = *aInPtr;
@ -146,7 +146,7 @@ class FullReductionKernelFunctor {
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
auto output_ptr = final_output;
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
@ -185,7 +185,7 @@ class FullReductionKernelFunctor {
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
auto output_ptr = final_output;
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
// vectorizable parts
@ -232,7 +232,7 @@ class GenericNondeterministicReducer {
// This is to bypass the stateful condition in the Eigen meanReducer
Op non_const_functor;
std::memcpy(&non_const_functor, &functor, sizeof (Op));
auto output_accessor_ptr = output_accessor.get_pointer();
auto output_accessor_ptr = output_accessor;
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
CoeffReturnType accum = functor.initialize();
@ -313,9 +313,9 @@ struct PartialReductionKernel {
Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
auto scratchPtr = scratch.get_pointer().get();
CoeffReturnType* scratchPtr = scratch.get_pointer();
auto outPtr =
output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
CoeffReturnType accumulator = op.initialize();
element_wise_reduce(globalRId, globalPId, accumulator);
@ -387,7 +387,7 @@ struct SecondStepPartialReduction {
if (globalId >= num_coeffs_to_preserve) return;
auto in_ptr = input_accessor.get_pointer() + globalId;
auto in_ptr = input_accessor + globalId;
OutScalar accumulator = op.initialize();
// num_coeffs_to_reduce is not bigger than 256
@ -395,7 +395,7 @@ struct SecondStepPartialReduction {
op.reduce(*in_ptr, &accumulator);
in_ptr += num_coeffs_to_preserve;
}
output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
output_accessor[globalId] = op.finalize(accumulator);
}
}; // namespace internal
@ -453,21 +453,18 @@ struct PartialReducerLauncher {
EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
num_coeffs_to_reduce);
num_coeffs_to_reduce).wait();
typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
SecondStepPartialReductionKernel;
dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
temp_accessor, output,
cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
reducer, num_coeffs_to_preserve, rNumGroups);
reducer, num_coeffs_to_preserve, rNumGroups).wait();
self.device().deallocate_temp(temp_pointer);
} else {
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
num_coeffs_to_reduce);
num_coeffs_to_reduce).wait();
}
return false;
}
@ -512,20 +509,19 @@ struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
local_range, inputSize, reducer);
local_range, inputSize, reducer).wait();
typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
EvaluatorPointerType, Index, local_range>
GenericRKernel;
dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
tmp_global_accessor, data,
cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
reducer);
reducer).wait();
dev.deallocate_temp(temp_pointer);
} else {
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
reducer);
reducer).wait();
}
}
};
@ -574,7 +570,7 @@ struct GenericReducer<Self, Op, Eigen::SyclDevice> {
dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1)).wait();
return false;
}
};

View File

@ -368,13 +368,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;

View File

@ -507,13 +507,6 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
m_impl.cleanup();
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_output.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
const Device EIGEN_DEVICE_REF m_device;

View File

@ -89,8 +89,8 @@ struct ScanKernelFunctor {
LocalAccessor scratch;
Evaluator dev_eval;
OutAccessor out_accessor;
OutAccessor temp_accessor;
OutAccessor out_ptr;
OutAccessor tmp_ptr;
const ScanParameters<Index> scanParameters;
Op accumulator;
const bool inclusive;
@ -100,8 +100,8 @@ struct ScanKernelFunctor {
const bool inclusive_)
: scratch(scratch_),
dev_eval(dev_eval_),
out_accessor(out_accessor_),
temp_accessor(temp_accessor_),
out_ptr(out_accessor_),
tmp_ptr(temp_accessor_),
scanParameters(scanParameters_),
accumulator(accumulator_),
inclusive(inclusive_) {}
@ -131,9 +131,7 @@ struct ScanKernelFunctor {
first_step_inclusive_Operation(InclusiveOp) const {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_accessor.get_pointer();
auto tmp_ptr = temp_accessor.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
@ -190,7 +188,7 @@ struct ScanKernelFunctor {
}
private_offset *= 2;
}
scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
private_scan[PacketSize - 1 + packetIndex];
private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
// traverse down tree & build scan
@ -219,9 +217,9 @@ struct ScanKernelFunctor {
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(scratch_ptr[ai], &accum);
accumulator.reduce(scratch_ptr[bi], &accum);
scratch_ptr[bi] = accumulator.finalize(accum);
accumulator.reduce(scratch[ai], &accum);
accumulator.reduce(scratch[bi], &accum);
scratch[bi] = accumulator.finalize(accum);
}
offset *= 2;
}
@ -234,10 +232,10 @@ struct ScanKernelFunctor {
scanParameters.non_scan_size +
group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
block_id;
tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
}
// clear the last element
scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
}
// traverse down tree & build scan
for (Index d = 1; d < scratch_stride; d *= 2) {
@ -248,10 +246,10 @@ struct ScanKernelFunctor {
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(scratch_ptr[ai], &accum);
accumulator.reduce(scratch_ptr[bi], &accum);
scratch_ptr[ai] = scratch_ptr[bi];
scratch_ptr[bi] = accumulator.finalize(accum);
accumulator.reduce(scratch[ai], &accum);
accumulator.reduce(scratch[bi], &accum);
scratch[ai] = scratch[bi];
scratch[bi] = accumulator.finalize(accum);
}
}
// Synchronise
@ -262,7 +260,7 @@ struct ScanKernelFunctor {
EIGEN_UNROLL_LOOP
for (Index i = 0; i < PacketSize; i++) {
CoeffReturnType accum = private_scan[packetIndex + i];
accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
private_scan[packetIndex + i] = accumulator.finalize(accum);
}
}
@ -294,22 +292,20 @@ struct ScanAdjustmentKernelFunctor {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
InAccessor in_accessor;
OutAccessor out_accessor;
InAccessor in_ptr;
OutAccessor out_ptr;
const ScanParameters<Index> scanParameters;
Op accumulator;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
OutAccessor out_accessor_,
const ScanParameters<Index> scanParameters_,
Op accumulator_)
: in_accessor(in_accessor_),
out_accessor(out_accessor_),
: in_ptr(in_accessor_),
out_ptr(out_accessor_),
scanParameters(scanParameters_),
accumulator(accumulator_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto in_ptr = in_accessor.get_pointer();
auto out_ptr = out_accessor.get_pointer();
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
@ -426,7 +422,7 @@ struct SYCLAdjustBlockOffset {
AdjustFuctor;
dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
scan_info.max_elements_per_block,
scan_info.get_scan_parameter(), accumulator);
scan_info.get_scan_parameter(), accumulator).wait();
}
};
@ -449,7 +445,7 @@ struct ScanLauncher_impl {
typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
scan_info.get_scan_parameter(), accumulator, inclusive);
scan_info.get_scan_parameter(), accumulator, inclusive).wait();
if (scan_info.block_size > 1) {
ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(

View File

@ -290,12 +290,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
Index input_index,

View File

@ -223,12 +223,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{

View File

@ -256,13 +256,6 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
return result;
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
// Given the output index, finds the first index in the input tensor used to compute the trace
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {

View File

@ -535,12 +535,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{

View File

@ -148,9 +148,6 @@ if(EIGEN_TEST_SYCL)
if(EIGEN_SYCL_REG_N)
add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
endif()
if(EIGEN_SYCL_USE_PROGRAM_CLASS)
add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
endif()
if(EIGEN_SYCL_ASYNC_EXECUTION)
add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
endif()

View File

@ -23,15 +23,6 @@
#include <stdint.h>
#include <iostream>
#ifdef SYCL_COMPILER_IS_DPCPP
template <typename T>
struct cl::sycl::is_device_copyable<
OffByOneScalar<T>,
std::enable_if_t<!(!std::is_trivially_copyable<OffByOneScalar<T>>::value &&
(std::is_const_v<OffByOneScalar<T>> || std::is_volatile_v<OffByOneScalar<T>>))>>
: std::true_type {};
#endif
template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
IndexType sizeDim1 = 100;
@ -39,7 +30,7 @@ void test_device_memory(const Eigen::SyclDevice &sycl_device) {
Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
// memset
memset(in1.data(), 1, in1.size() * sizeof(DataType));
sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
@ -47,7 +38,7 @@ void test_device_memory(const Eigen::SyclDevice &sycl_device) {
for (IndexType i=0; i<in.size(); i++) {
VERIFY_IS_EQUAL(in(i), in1(i));
}
// fill
DataType value = DataType(7);
std::fill_n(in1.data(), in1.size(), value);
@ -77,31 +68,6 @@ void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_data);
}
template<typename DataType, int DataLayout, typename IndexType>
void test_device_attach_buffer(const Eigen::SyclDevice &sycl_device) {
IndexType sizeDim1 = 100;
array<IndexType, 1> tensorRange = {{sizeDim1}};
Tensor<DataType, 1, DataLayout, IndexType> in(tensorRange);
cl::sycl::buffer<buffer_scalar_t, 1> buffer(cl::sycl::range<1>(sizeDim1 * sizeof(DataType)));
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.attach_buffer(buffer));
// fill
DataType value = DataType(7);
std::fill_n(in.data(), in.size(), value);
sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);
// Check that buffer is filled with the correct value.
auto reint = buffer.reinterpret<DataType>(cl::sycl::range<1>(sizeDim1));
auto access = reint.template get_access<cl::sycl::access::mode::read>();
for (IndexType i=0; i<in.size(); i++) {
VERIFY_IS_EQUAL(in(i), access[i]);
}
sycl_device.detach_buffer(gpu_in_data);
}
template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
@ -112,7 +78,6 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
//test_device_exceptions<DataType, RowMajor>(sycl_device);
/// this test throws an exception. Enable it if you want to see the exception
//test_device_exceptions<DataType, ColMajor>(sycl_device);
test_device_attach_buffer<DataType, ColMajor, int64_t>(sycl_device);
}
EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {