[SYCL-2020] Enable USM (unified shared memory) support for the SYCL backend; SYCL 1.2.1 did not support USM.
This commit is contained in:
parent 1698c367a0
commit 0623791930
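
For orientation before the hunks below: SYCL 1.2.1 forced the backend to own memory through buffers and reach it through (placeholder) accessors, while SYCL 2020 USM hands out raw device pointers that can be allocated, copied to and freed directly. A minimal, hedged sketch of the USM style this commit moves to (generic SYCL 2020 code, not Eigen's actual wrapper code):

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <vector>

    int main() {
      // In-order queue, matching the property the new QueueInterface constructor adds.
      sycl::queue q{sycl::property::queue::in_order()};
      const std::size_t n = 256;
      std::vector<float> host(n, 1.0f);

      float* dev = sycl::malloc_device<float>(n, q);            // USM device allocation
      q.memcpy(dev, host.data(), n * sizeof(float));            // host -> device copy
      q.parallel_for(sycl::range<1>(n),
                     [=](sycl::id<1> i) { dev[i] *= 2.0f; });   // raw pointer used in the kernel
      q.memcpy(host.data(), dev, n * sizeof(float)).wait();     // device -> host, then block
      sycl::free(dev, q);
    }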
@@ -249,7 +249,6 @@ using std::ptrdiff_t;
#endif

#if defined(EIGEN_USE_SYCL)
#include "src/Core/arch/SYCL/SyclMemoryModel.h"
#include "src/Core/arch/SYCL/InteropHeaders.h"
#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
#include "src/Core/arch/SYCL/PacketMath.h"
@@ -33,13 +33,9 @@ namespace internal {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
pload##AlignedType<packet_type>( \
const typename unpacket_traits<packet_type>::type* from) { \
using scalar = typename unpacket_traits<packet_type>::type; \
typedef cl::sycl::multi_ptr< \
const scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(from);\
packet_type res{}; \
res.load(0, multi_ptr(from)); \
res.load(0, ptr); \
return res; \
}

@@ -54,11 +50,8 @@ SYCL_PLOAD(cl::sycl::cl_double2, )
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
scalar* to, const packet_type& from) { \
typedef cl::sycl::multi_ptr< \
scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
from.store(0, multi_ptr(to)); \
auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(to);\
from.store(0, ptr); \
}

SYCL_PSTORE(float, cl::sycl::cl_float4, )
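
For readability, roughly what the rewritten SYCL_PLOAD now expands to for one packet type (a hedged sketch assuming the cl_float4 specialization; the exact emitted code lives in PacketMath.h). The accessor-backed multi_ptr construction is replaced by an address_space_cast on the raw USM pointer:

    template <>
    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
    pload<cl::sycl::cl_float4>(const float* from) {
      // Cast the raw (USM) pointer into a generic-space multi_ptr the vec API accepts.
      auto ptr = cl::sycl::address_space_cast<
          cl::sycl::access::address_space::generic_space,
          cl::sycl::access::decorated::no>(from);
      cl::sycl::cl_float4 res{};
      res.load(0, ptr);  // element offset 0, i.e. load 4 floats starting at `from`
      return res;
    }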
@@ -370,60 +363,6 @@ inline cl::sycl::cl_double2 pblend(
}
#endif // SYCL_DEVICE_ONLY

template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, T>& from) {
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}

#define SYCL_PLOAD(Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type> \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD

template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>& from) {
return ploadt<packet_type, Alignment>(from.get_pointer());
}

#define SYCL_PSTORE(alignment) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type>& to, \
const packet_type& from) { \
pstore##alignment(to.get_pointer(), from); \
}

// global space
SYCL_PSTORE()
SYCL_PSTORE(u)

#undef SYCL_PSTORE

template <typename scalar, typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(
Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>
to,
const packet_type& from) {
pstoret<scalar, packet_type, Alignment>(to.get_pointer(), from);
}

} // end namespace internal

} // end namespace Eigen

@@ -1,700 +0,0 @@
/***************************************************************************
 * Copyright (C) 2017 Codeplay Software Limited
 * This Source Code Form is subject to the terms of the Mozilla
 * Public License v. 2.0. If a copy of the MPL was not distributed
 * with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 *
 * SyclMemoryModel.h
 *
 * Description:
 * Interface for SYCL buffers to behave as a non-dereferenceable pointer
 * Interface for Placeholder accessor to behave as a pointer on both host
 * and device
 *
 * Authors:
 *
 * Ruyman Reyes Codeplay Software Ltd.
 * Mehdi Goli Codeplay Software Ltd.
 * Vanya Yaneva Codeplay Software Ltd.
 *
 **************************************************************************/

#if defined(EIGEN_USE_SYCL) && \
    !defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H)
#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H

#include <CL/sycl.hpp>
#ifdef EIGEN_EXCEPTIONS
#include <stdexcept>
#endif
#include <cstddef>
#include <queue>
#include <set>
#include <unordered_map>

#include "../../InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

using sycl_acc_target = cl::sycl::access::target;
using sycl_acc_mode = cl::sycl::access::mode;

/**
 * Default values for template arguments
 */
using buffer_data_type_t = uint8_t;
const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer;
const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write;

/**
|
||||
* PointerMapper
|
||||
* Associates fake pointers with buffers.
|
||||
*
|
||||
*/
|
||||
class PointerMapper {
|
||||
public:
|
||||
using base_ptr_t = std::intptr_t;
|
||||
|
||||
/* Structure of a virtual pointer
|
||||
*
|
||||
* |================================================|
|
||||
* | POINTER ADDRESS |
|
||||
* |================================================|
|
||||
*/
|
||||
struct virtual_pointer_t {
|
||||
/* Type for the pointers
|
||||
*/
|
||||
base_ptr_t m_contents;
|
||||
|
||||
/** Conversions from virtual_pointer_t to
|
||||
* void * should just reinterpret_cast the integer number
|
||||
*/
|
||||
operator void *() const { return reinterpret_cast<void *>(m_contents); }
|
||||
|
||||
/**
|
||||
* Convert back to the integer number.
|
||||
*/
|
||||
operator base_ptr_t() const { return m_contents; }
|
||||
|
||||
/**
|
||||
* Add a certain value to the pointer to create a
|
||||
* new pointer to that offset
|
||||
*/
|
||||
virtual_pointer_t operator+(size_t off) { return m_contents + off; }
|
||||
|
||||
/* Numerical order for sorting pointers in containers. */
|
||||
bool operator<(virtual_pointer_t rhs) const {
|
||||
return (static_cast<base_ptr_t>(m_contents) <
|
||||
static_cast<base_ptr_t>(rhs.m_contents));
|
||||
}
|
||||
|
||||
bool operator>(virtual_pointer_t rhs) const {
|
||||
return (static_cast<base_ptr_t>(m_contents) >
|
||||
static_cast<base_ptr_t>(rhs.m_contents));
|
||||
}
|
||||
|
||||
/**
|
||||
* Numerical order for sorting pointers in containers
|
||||
*/
|
||||
bool operator==(virtual_pointer_t rhs) const {
|
||||
return (static_cast<base_ptr_t>(m_contents) ==
|
||||
static_cast<base_ptr_t>(rhs.m_contents));
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple forward to the equality overload.
|
||||
*/
|
||||
bool operator!=(virtual_pointer_t rhs) const {
|
||||
return !(this->operator==(rhs));
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a void * into a virtual pointer structure.
|
||||
* Note that this will only work if the void * was
|
||||
* already a virtual_pointer_t, but we have no way of
|
||||
* checking
|
||||
*/
|
||||
virtual_pointer_t(const void *ptr)
|
||||
: m_contents(reinterpret_cast<base_ptr_t>(ptr)){};
|
||||
|
||||
/**
|
||||
* Creates a virtual_pointer_t from the given integer
|
||||
* number
|
||||
*/
|
||||
virtual_pointer_t(base_ptr_t u) : m_contents(u){};
|
||||
};
|
||||
|
||||
/* Definition of a null pointer
|
||||
*/
|
||||
const virtual_pointer_t null_virtual_ptr = nullptr;
|
||||
|
||||
/**
|
||||
* Whether if a pointer is null or not.
|
||||
* A pointer is nullptr if the value is of null_virtual_ptr
|
||||
*/
|
||||
static inline bool is_nullptr(virtual_pointer_t ptr) {
|
||||
return (static_cast<void *>(ptr) == nullptr);
|
||||
}
|
||||
|
||||
/* basic type for all buffers
|
||||
*/
|
||||
using buffer_t = cl::sycl::buffer<buffer_data_type_t>;
|
||||
|
||||
/**
|
||||
* Node that stores information about a device allocation.
|
||||
* Nodes are sorted by size to organise a free list of nodes
|
||||
* that can be recovered.
|
||||
*/
|
||||
struct pMapNode_t {
|
||||
buffer_t m_buffer;
|
||||
size_t m_size;
|
||||
bool m_free;
|
||||
|
||||
pMapNode_t(buffer_t b, size_t size, bool f)
|
||||
: m_buffer{b}, m_size{size}, m_free{f} {
|
||||
m_buffer.set_final_data(nullptr);
|
||||
}
|
||||
|
||||
bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); }
|
||||
};
|
||||
|
||||
/** Storage of the pointer / buffer tree
|
||||
*/
|
||||
using pointerMap_t = std::map<virtual_pointer_t, pMapNode_t>;
|
||||
|
||||
/**
|
||||
* Obtain the insertion point in the pointer map for
|
||||
* a pointer of the given size.
|
||||
* \param requiredSize Size attempted to reclaim
|
||||
*/
|
||||
typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) {
|
||||
typename pointerMap_t::iterator retVal;
|
||||
bool reuse = false;
|
||||
if (!m_freeList.empty()) {
|
||||
// try to re-use an existing block
|
||||
for (auto freeElem : m_freeList) {
|
||||
if (freeElem->second.m_size >= requiredSize) {
|
||||
retVal = freeElem;
|
||||
reuse = true;
|
||||
// Element is not going to be free anymore
|
||||
m_freeList.erase(freeElem);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!reuse) {
|
||||
retVal = std::prev(m_pointerMap.end());
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an iterator to the node that stores the information
|
||||
* of the given virtual pointer from the given pointer map structure.
|
||||
* If pointer is not found, throws std::out_of_range.
|
||||
* If the pointer map structure is empty, throws std::out_of_range
|
||||
*
|
||||
* \param pMap the pointerMap_t structure storing all the pointers
|
||||
* \param virtual_pointer_ptr The virtual pointer to obtain the node of
|
||||
* \throws std::out:of_range if the pointer is not found or pMap is empty
|
||||
*/
|
||||
typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) {
|
||||
if (this->count() == 0) {
|
||||
m_pointerMap.clear();
|
||||
EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n"));
|
||||
|
||||
}
|
||||
if (is_nullptr(ptr)) {
|
||||
m_pointerMap.clear();
|
||||
EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n"));
|
||||
}
|
||||
// The previous element to the lower bound is the node that
|
||||
// holds this memory address
|
||||
auto node = m_pointerMap.lower_bound(ptr);
|
||||
// If the value of the pointer is not the one of the node
|
||||
// then we return the previous one
|
||||
if (node == std::end(m_pointerMap)) {
|
||||
--node;
|
||||
} else if (node->first != ptr) {
|
||||
if (node == std::begin(m_pointerMap)) {
|
||||
m_pointerMap.clear();
|
||||
EIGEN_THROW_X(
|
||||
std::out_of_range("The pointer is not registered in the map\n"));
|
||||
}
|
||||
--node;
|
||||
}
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* get_buffer.
|
||||
* Returns a buffer from the map using the pointer address
|
||||
*/
|
||||
template <typename buffer_data_type = buffer_data_type_t>
|
||||
cl::sycl::buffer<buffer_data_type, 1> get_buffer(
|
||||
const virtual_pointer_t ptr) {
|
||||
|
||||
auto node = get_node(ptr);
|
||||
auto& map_node = node->second;
|
||||
eigen_assert(node->first == ptr || node->first < ptr);
|
||||
eigen_assert(ptr < static_cast<virtual_pointer_t>(map_node.m_size +
|
||||
node->first));
|
||||
return map_node.m_buffer.reinterpret<buffer_data_type>(
|
||||
cl::sycl::range<1>{map_node.m_size / sizeof(buffer_data_type)});
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Returns an accessor to the buffer of the given virtual pointer
|
||||
* @param accessMode
|
||||
* @param accessTarget
|
||||
* @param ptr The virtual pointer
|
||||
*/
|
||||
template <sycl_acc_mode access_mode = default_acc_mode,
|
||||
sycl_acc_target access_target = default_acc_target,
|
||||
typename buffer_data_type = buffer_data_type_t>
|
||||
cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
|
||||
get_access(const virtual_pointer_t ptr) {
|
||||
auto buf = get_buffer<buffer_data_type>(ptr);
|
||||
return buf.template get_access<access_mode, access_target>();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Returns an accessor to the buffer of the given virtual pointer
|
||||
* in the given command group scope
|
||||
* @param accessMode
|
||||
* @param accessTarget
|
||||
* @param ptr The virtual pointer
|
||||
* @param cgh Reference to the command group scope
|
||||
*/
|
||||
template <sycl_acc_mode access_mode = default_acc_mode,
|
||||
sycl_acc_target access_target = default_acc_target,
|
||||
typename buffer_data_type = buffer_data_type_t>
|
||||
cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
|
||||
get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) {
|
||||
auto buf = get_buffer<buffer_data_type>(ptr);
|
||||
return buf.template get_access<access_mode, access_target>(cgh);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the offset from the base address of this pointer.
|
||||
*/
|
||||
inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) {
|
||||
// The previous element to the lower bound is the node that
|
||||
// holds this memory address
|
||||
auto node = get_node(ptr);
|
||||
auto start = node->first;
|
||||
eigen_assert(start == ptr || start < ptr);
|
||||
eigen_assert(ptr < start + node->second.m_size);
|
||||
return (ptr - start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the number of elements by which the given pointer is offset from
|
||||
* the base address.
|
||||
*/
|
||||
template <typename buffer_data_type>
|
||||
inline size_t get_element_offset(const virtual_pointer_t ptr) {
|
||||
return get_offset(ptr) / sizeof(buffer_data_type);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs the PointerMapper structure.
|
||||
*/
|
||||
PointerMapper(base_ptr_t baseAddress = 4096)
|
||||
: m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} {
|
||||
if (m_baseAddress == 0) {
|
||||
EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n"));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* PointerMapper cannot be copied or moved
|
||||
*/
|
||||
PointerMapper(const PointerMapper &) = delete;
|
||||
|
||||
/**
|
||||
* Empty the pointer list
|
||||
*/
|
||||
inline void clear() {
|
||||
m_freeList.clear();
|
||||
m_pointerMap.clear();
|
||||
}
|
||||
|
||||
/* add_pointer.
|
||||
* Adds an existing pointer to the map and returns the virtual pointer id.
|
||||
*/
|
||||
inline virtual_pointer_t add_pointer(const buffer_t &b) {
|
||||
return add_pointer_impl(b);
|
||||
}
|
||||
|
||||
/* add_pointer.
|
||||
* Adds a pointer to the map and returns the virtual pointer id.
|
||||
*/
|
||||
inline virtual_pointer_t add_pointer(buffer_t &&b) {
|
||||
return add_pointer_impl(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Fuses the given node with the previous nodes in the
|
||||
* pointer map if they are free
|
||||
*
|
||||
* @param node A reference to the free node to be fused
|
||||
*/
|
||||
void fuse_forward(typename pointerMap_t::iterator &node) {
|
||||
while (node != std::prev(m_pointerMap.end())) {
|
||||
// if following node is free
|
||||
// remove it and extend the current node with its size
|
||||
auto fwd_node = std::next(node);
|
||||
if (!fwd_node->second.m_free) {
|
||||
break;
|
||||
}
|
||||
auto fwd_size = fwd_node->second.m_size;
|
||||
m_freeList.erase(fwd_node);
|
||||
m_pointerMap.erase(fwd_node);
|
||||
|
||||
node->second.m_size += fwd_size;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Fuses the given node with the following nodes in the
|
||||
* pointer map if they are free
|
||||
*
|
||||
* @param node A reference to the free node to be fused
|
||||
*/
|
||||
void fuse_backward(typename pointerMap_t::iterator &node) {
|
||||
while (node != m_pointerMap.begin()) {
|
||||
// if previous node is free, extend it
|
||||
// with the size of the current one
|
||||
auto prev_node = std::prev(node);
|
||||
if (!prev_node->second.m_free) {
|
||||
break;
|
||||
}
|
||||
prev_node->second.m_size += node->second.m_size;
|
||||
|
||||
// remove the current node
|
||||
m_freeList.erase(node);
|
||||
m_pointerMap.erase(node);
|
||||
|
||||
// point to the previous node
|
||||
node = prev_node;
|
||||
}
|
||||
}
|
||||
|
||||
/* remove_pointer.
|
||||
* Removes the given pointer from the map.
|
||||
* The pointer is allowed to be reused only if ReUse if true.
|
||||
*/
|
||||
template <bool ReUse = true>
|
||||
void remove_pointer(const virtual_pointer_t ptr) {
|
||||
if (is_nullptr(ptr)) {
|
||||
return;
|
||||
}
|
||||
auto node = this->get_node(ptr);
|
||||
|
||||
node->second.m_free = true;
|
||||
m_freeList.emplace(node);
|
||||
|
||||
// Fuse the node
|
||||
// with free nodes before and after it
|
||||
fuse_forward(node);
|
||||
fuse_backward(node);
|
||||
|
||||
// If after fusing the node is the last one
|
||||
// simply remove it (since it is free)
|
||||
if (node == std::prev(m_pointerMap.end())) {
|
||||
m_freeList.erase(node);
|
||||
m_pointerMap.erase(node);
|
||||
}
|
||||
}
|
||||
|
||||
/* count.
|
||||
* Return the number of active pointers (i.e, pointers that
|
||||
* have been malloc but not freed).
|
||||
*/
|
||||
size_t count() const { return (m_pointerMap.size() - m_freeList.size()); }
|
||||
|
||||
private:
|
||||
/* add_pointer_impl.
|
||||
* Adds a pointer to the map and returns the virtual pointer id.
|
||||
* BufferT is either a const buffer_t& or a buffer_t&&.
|
||||
*/
|
||||
template <class BufferT>
|
||||
virtual_pointer_t add_pointer_impl(BufferT b) {
|
||||
virtual_pointer_t retVal = nullptr;
|
||||
size_t bufSize = b.get_count() * sizeof(buffer_data_type_t);
|
||||
auto byte_buffer =
|
||||
b.template reinterpret<buffer_data_type_t>(cl::sycl::range<1>{bufSize});
|
||||
pMapNode_t p{byte_buffer, bufSize, false};
|
||||
|
||||
// If this is the first pointer:
|
||||
if (m_pointerMap.empty()) {
|
||||
virtual_pointer_t initialVal{m_baseAddress};
|
||||
m_pointerMap.emplace(initialVal, p);
|
||||
return initialVal;
|
||||
}
|
||||
|
||||
auto lastElemIter = get_insertion_point(bufSize);
|
||||
// We are recovering an existing free node
|
||||
if (lastElemIter->second.m_free) {
|
||||
lastElemIter->second.m_buffer = b;
|
||||
lastElemIter->second.m_free = false;
|
||||
|
||||
// If the recovered node is bigger than the inserted one
|
||||
// add a new free node with the remaining space
|
||||
if (lastElemIter->second.m_size > bufSize) {
|
||||
// create a new node with the remaining space
|
||||
auto remainingSize = lastElemIter->second.m_size - bufSize;
|
||||
pMapNode_t p2{b, remainingSize, true};
|
||||
|
||||
// update size of the current node
|
||||
lastElemIter->second.m_size = bufSize;
|
||||
|
||||
// add the new free node
|
||||
auto newFreePtr = lastElemIter->first + bufSize;
|
||||
auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first;
|
||||
m_freeList.emplace(freeNode);
|
||||
}
|
||||
|
||||
retVal = lastElemIter->first;
|
||||
} else {
|
||||
size_t lastSize = lastElemIter->second.m_size;
|
||||
retVal = lastElemIter->first + lastSize;
|
||||
m_pointerMap.emplace(retVal, p);
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two iterators to pointer map entries according to
|
||||
* the size of the allocation on the device.
|
||||
*/
|
||||
struct SortBySize {
|
||||
bool operator()(typename pointerMap_t::iterator a,
|
||||
typename pointerMap_t::iterator b) const {
|
||||
return ((a->first < b->first) && (a->second <= b->second)) ||
|
||||
((a->first < b->first) && (b->second <= a->second));
|
||||
}
|
||||
};
|
||||
|
||||
/* Maps the pointer addresses to buffer and size pairs.
|
||||
*/
|
||||
pointerMap_t m_pointerMap;
|
||||
|
||||
/* List of free nodes available for re-using
|
||||
*/
|
||||
std::set<typename pointerMap_t::iterator, SortBySize> m_freeList;
|
||||
|
||||
/* Base address used when issuing the first virtual pointer, allows users
|
||||
* to specify alignment. Cannot be zero. */
|
||||
std::intptr_t m_baseAddress;
|
||||
};
|
||||
|
||||
/* remove_pointer.
|
||||
* Removes the given pointer from the map.
|
||||
* The pointer is allowed to be reused only if ReUse if true.
|
||||
*/
|
||||
template <>
|
||||
inline void PointerMapper::remove_pointer<false>(const virtual_pointer_t ptr) {
|
||||
if (is_nullptr(ptr)) {
|
||||
return;
|
||||
}
|
||||
m_pointerMap.erase(this->get_node(ptr));
|
||||
}
|
||||
|
||||
/**
|
||||
* Malloc-like interface to the pointer-mapper.
|
||||
* Given a size, creates a byte-typed buffer and returns a
|
||||
* fake pointer to keep track of it.
|
||||
* \param size Size in bytes of the desired allocation
|
||||
* \throw cl::sycl::exception if error while creating the buffer
|
||||
*/
|
||||
inline void *SYCLmalloc(size_t size, PointerMapper &pMap) {
|
||||
if (size == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
// Create a generic buffer of the given size
|
||||
using buffer_t = cl::sycl::buffer<buffer_data_type_t, 1>;
|
||||
auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size}));
|
||||
// Store the buffer on the global list
|
||||
return static_cast<void *>(thePointer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Free-like interface to the pointer mapper.
|
||||
* Given a fake-pointer created with the virtual-pointer malloc,
|
||||
* destroys the buffer and remove it from the list.
|
||||
* If ReUse is false, the pointer is not added to the freeList,
|
||||
* it should be false only for sub-buffers.
|
||||
*/
|
||||
template <bool ReUse = true, typename PointerMapper>
|
||||
inline void SYCLfree(void *ptr, PointerMapper &pMap) {
|
||||
pMap.template remove_pointer<ReUse>(ptr);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear all the memory allocated by SYCL.
|
||||
*/
|
||||
template <typename PointerMapper>
|
||||
inline void SYCLfreeAll(PointerMapper &pMap) {
|
||||
pMap.clear();
|
||||
}
|
||||
|
||||
template <cl::sycl::access::mode AcMd, typename T>
|
||||
struct RangeAccess {
|
||||
static const auto global_access = cl::sycl::access::target::global_buffer;
|
||||
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
|
||||
typedef T scalar_t;
|
||||
typedef scalar_t &ref_t;
|
||||
typedef scalar_t *ptr_t;
|
||||
|
||||
// the accessor type does not necessarily the same as T
|
||||
typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
|
||||
accessor;
|
||||
|
||||
typedef RangeAccess<AcMd, T> self_t;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access,
|
||||
size_t offset,
|
||||
std::intptr_t virtual_ptr)
|
||||
: access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {}
|
||||
|
||||
RangeAccess(cl::sycl::buffer<scalar_t, 1> buff =
|
||||
cl::sycl::buffer<scalar_t, 1>(cl::sycl::range<1>(1)))
|
||||
: access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {}
|
||||
|
||||
// This should be only used for null constructor on the host side
|
||||
RangeAccess(std::nullptr_t) : RangeAccess() {}
|
||||
// This template parameter must be removed and scalar_t should be replaced
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
|
||||
typedef cl::sycl::multi_ptr<scalar_t,
|
||||
cl::sycl::access::address_space::generic_space,
|
||||
cl::sycl::access::decorated::no>
|
||||
multi_ptr;
|
||||
multi_ptr p(access_);
|
||||
return (p + offset_).get_raw();
|
||||
}
|
||||
template <typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
|
||||
offset_ += (offset);
|
||||
return *this;
|
||||
}
|
||||
template <typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const {
|
||||
return self_t(access_, offset_ + offset, virtual_ptr_);
|
||||
}
|
||||
template <typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const {
|
||||
return self_t(access_, offset_ - offset, virtual_ptr_);
|
||||
}
|
||||
template <typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) {
|
||||
offset_ -= offset;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// THIS IS FOR NULL COMPARISON ONLY
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
|
||||
const RangeAccess &lhs, std::nullptr_t) {
|
||||
return ((lhs.virtual_ptr_ == -1));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
|
||||
const RangeAccess &lhs, std::nullptr_t i) {
|
||||
return !(lhs == i);
|
||||
}
|
||||
|
||||
// THIS IS FOR NULL COMPARISON ONLY
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
|
||||
std::nullptr_t, const RangeAccess &rhs) {
|
||||
return ((rhs.virtual_ptr_ == -1));
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
|
||||
std::nullptr_t i, const RangeAccess &rhs) {
|
||||
return !(i == rhs);
|
||||
}
|
||||
// Prefix operator (Increment and return value)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() {
|
||||
offset_++;
|
||||
return (*this);
|
||||
}
|
||||
|
||||
// Postfix operator (Return value and increment)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) {
|
||||
EIGEN_UNUSED_VARIABLE(i);
|
||||
self_t temp_iterator(*this);
|
||||
offset_++;
|
||||
return temp_iterator;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const {
|
||||
return (access_.get_count() - offset_);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const {
|
||||
return offset_;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) {
|
||||
offset_ = offset;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const {
|
||||
return *get_pointer();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() {
|
||||
return *get_pointer();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) {
|
||||
return *(get_pointer() + x);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const {
|
||||
return *(get_pointer() + x);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const {
|
||||
return reinterpret_cast<scalar_t *>(virtual_ptr_ +
|
||||
(offset_ * sizeof(scalar_t)));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const {
|
||||
return (virtual_ptr_ != -1);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess<AcMd, const T>() {
|
||||
return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
operator RangeAccess<AcMd, const T>() const {
|
||||
return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
|
||||
}
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
|
||||
cl::sycl::handler &cgh) const {
|
||||
cgh.require(access_);
|
||||
}
|
||||
|
||||
private:
|
||||
accessor access_;
|
||||
size_t offset_;
|
||||
std::intptr_t virtual_ptr_; // the location of the buffer in the map
|
||||
};
|
||||
|
||||
template <cl::sycl::access::mode AcMd, typename T>
|
||||
struct RangeAccess<AcMd, const T> : RangeAccess<AcMd, T> {
|
||||
typedef RangeAccess<AcMd, T> Base;
|
||||
using Base::Base;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace TensorSycl
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
|
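
For context on what is being deleted above: SyclMemoryModel.h implemented a virtual-pointer scheme so that buffer-based SYCL 1.2.1 could still look pointer-like to Eigen. A hedged sketch of how that interface was used, with names taken from the deleted file itself:

    // Buffer-era usage sketch: "pointers" were integer keys into a PointerMapper.
    Eigen::TensorSycl::internal::PointerMapper pMap;

    void* p = Eigen::TensorSycl::internal::SYCLmalloc(1024, pMap);  // fake pointer, backed by a buffer
    auto buf = pMap.get_buffer(p);   // recover the underlying cl::sycl::buffer
    auto off = pMap.get_offset(p);   // byte offset of p inside that buffer
    Eigen::TensorSycl::internal::SYCLfree(p, pMap);

    // With USM none of this bookkeeping is needed; sycl::malloc_device returns a real device pointer.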
@@ -531,12 +531,12 @@
#error "NVCC as the target platform for HIPCC is currently not supported."
#endif

#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is either nvcc or clang with CUDA enabled
#define EIGEN_CUDACC __CUDACC__
#endif

#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__)
// Means we are generating code for the device
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
@@ -548,7 +548,7 @@
#define EIGEN_CUDA_SDK_VER 0
#endif

#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
#define EIGEN_HIPCC __HIPCC__

@@ -557,7 +557,7 @@
// ++ host_defines.h which contains the defines for the __host__ and __device__ macros
#include <hip/hip_runtime.h>

#if defined(__HIP_DEVICE_COMPILE__)
#if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
#endif
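
The added !defined(__SYCL_DEVICE_ONLY__) guards keep Eigen's CUDA/HIP macros from being defined during the SYCL device-compilation pass, where a CUDA- or HIP-backed SYCL compiler may also define __CUDACC__ or __HIPCC__. A hedged sketch of the dispatch this protects:

    // Sketch only: which backend path is taken for a given compilation pass.
    #if defined(EIGEN_CUDA_ARCH)
      // device code generated by the CUDA compiler proper
    #elif defined(SYCL_DEVICE_ONLY)
      // device code generated by the SYCL device pass (even on a CUDA-backed SYCL toolchain)
    #else
      // host code
    #endif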
@@ -2,17 +2,13 @@
// A Scalar with internal representation T+1 so that zero is internally
// represented by T(1). This is used to test memory fill.
//
#pragma once
template<typename T>
class OffByOneScalar {
public:
OffByOneScalar() : val_(1) {}
OffByOneScalar(const OffByOneScalar& other) {
*this = other;
}
OffByOneScalar& operator=(const OffByOneScalar& other) {
val_ = other.val_;
return *this;
}
OffByOneScalar(const OffByOneScalar& other) = default;
OffByOneScalar& operator=(const OffByOneScalar& other) = default;

OffByOneScalar(T val) : val_(val + 1) {}
OffByOneScalar& operator=(T val) {
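
Switching the hand-written copy constructor and assignment to = default plausibly matters because it makes the test scalar trivially copyable, which is what raw byte copies (memcpy/memset-style device fills) rely on. A small hedged check:

    #include <type_traits>

    // Assumes OffByOneScalar as defined above, with the defaulted copy operations.
    static_assert(std::is_trivially_copyable<OffByOneScalar<int>>::value,
                  "defaulted copy ops keep the test scalar trivially copyable");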
@@ -128,12 +128,6 @@ struct TensorEvaluator<const TensorIndexPairOp<ArgType>, Device>

EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif

protected:
TensorEvaluator<ArgType, Device> m_impl;
};
@@ -278,12 +272,6 @@ struct TensorEvaluator<const TensorPairReducerOp<ReduceOp, Dims, ArgType>, Devic
}

EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_orig_impl.bind(cgh);
}
#endif

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {

@@ -228,14 +228,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
block.cleanup();
}

#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif

EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }

private:

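
The removed bind() members existed only to register placeholder accessors with the command group (cgh.require) before a kernel could use them. With USM the evaluators carry raw device pointers, so there is nothing to bind. A hedged sketch of the difference, using generic SYCL rather than Eigen's evaluator types:

    #include <sycl/sycl.hpp>
    #include <cstddef>

    // Buffer era (sketch): a placeholder accessor had to be required inside the command group:
    //   q.submit([&](sycl::handler& cgh) { cgh.require(placeholder_acc); cgh.parallel_for(...); });
    // USM era (sketch): the raw pointer is simply captured by the kernel lambda.
    void scale(sycl::queue& q, float* data, std::size_t n, float factor) {
      q.parallel_for(sycl::range<1>(n),
                     [=](sycl::id<1> i) { data[i] *= factor; });
    }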
@@ -687,13 +687,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

Broadcast functor() const { return m_broadcast; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
cl::sycl::handler& cgh) const {
m_impl.bind(cgh);
}
#endif
private:
static constexpr bool IsColMajor =
static_cast<int>(Layout) == static_cast<int>(ColMajor);

@@ -355,12 +355,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
return NULL;
}
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif

protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const

@@ -279,14 +279,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy

EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif

protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;

@ -61,13 +61,6 @@ struct CoeffLoader {
|
||||
return m_tensor.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// The placeholder accessors require to be bound to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_tensor.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
const Tensor m_tensor;
|
||||
};
|
||||
@ -97,12 +90,6 @@ struct CoeffLoader<Tensor, true, MakePointer_> {
|
||||
return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// The placeholder accessors require to be bound to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_data.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
private:
|
||||
typedef typename Tensor::Scalar Scalar;
|
||||
|
||||
@ -250,13 +237,6 @@ class SimpleTensorContractionMapper {
|
||||
return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
|
||||
}
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// The placeholder accessors require to be bound to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_tensor.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
|
||||
return m_tensor;
|
||||
}
|
||||
@ -508,13 +488,6 @@ class TensorContractionSubMapper {
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// The placeholder accessors require to be bound to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_base_mapper.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
const ParentMapper& base_mapper() const { return m_base_mapper; }
|
||||
Index vert_offset() const { return m_vert_offset; }
|
||||
Index horiz_offset() const { return m_horiz_offset; }
|
||||
|
@ -627,7 +627,7 @@ class TensorContractionKernel {
|
||||
ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
|
||||
mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
|
||||
|
||||
auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
|
||||
auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
|
||||
|
||||
(thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
|
||||
: compute_panel<false>(itemID, thread_properties, out_ptr);
|
||||
@ -1048,7 +1048,7 @@ struct GeneralVectorTensor {
|
||||
is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
|
||||
const StorageIndex contractGroupId =
|
||||
is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
|
||||
auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
|
||||
auto out_ptr = out_res + (IsFinal ? 0 : contractGroupId * nonContractDim);
|
||||
|
||||
const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
|
||||
const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
|
||||
@ -1255,8 +1255,8 @@ struct GeneralScalarContraction {
|
||||
|
||||
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||
|
||||
auto out_ptr = out_res.get_pointer();
|
||||
auto scratch_ptr = scratch.get_pointer().get();
|
||||
auto out_ptr = out_res;
|
||||
OutScalar * scratch_ptr = scratch.get_pointer();
|
||||
|
||||
StorageIndex globalid = itemID.get_global_id(0);
|
||||
StorageIndex localid = itemID.get_local_id(0);
|
||||
@ -1395,13 +1395,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
|
||||
typedef internal::TensorContractionInputMapper<
|
||||
LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
|
||||
PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
|
||||
PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer>
|
||||
LhsMapper;
|
||||
|
||||
typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
|
||||
right_nocontract_t, contract_t,
|
||||
PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
|
||||
rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
|
||||
rhs_inner_dim_reordered, Unaligned, MakePointer>
|
||||
RhsMapper;
|
||||
|
||||
// initialize data mappers
|
||||
@ -1505,8 +1505,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
|
||||
PacketAccess, input_mapper_properties, true, ct>
|
||||
ContractKernelName;
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
|
||||
lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
|
||||
lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim).wait();
|
||||
} else {
|
||||
typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
|
||||
LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
|
||||
@ -1518,7 +1518,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
|
||||
lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
|
||||
triple_dim);
|
||||
triple_dim).wait();
|
||||
|
||||
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
|
||||
auto op = Op();
|
||||
@ -1531,8 +1531,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
|
||||
Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
|
||||
cl::sycl::range<1>(localRange)),
|
||||
StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
|
||||
|
||||
StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK).wait();
|
||||
device().deallocate_temp(temp_pointer);
|
||||
}
|
||||
}
|
||||
@ -1566,28 +1565,28 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
|
||||
EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
|
||||
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
|
||||
vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
|
||||
vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait();
|
||||
|
||||
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
|
||||
typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
|
||||
EvaluatorPointerType, Op>
|
||||
ReductionKernel;
|
||||
|
||||
device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
|
||||
device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
|
||||
tmp_global_accessor, buffer,
|
||||
cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
|
||||
cl::sycl::range<1>(localRange)),
|
||||
StorageIndex(1), Op(), nonContractDim, cNumGroups);
|
||||
|
||||
StorageIndex(1), Op(), nonContractDim, cNumGroups).wait();
|
||||
device().deallocate_temp(temp_pointer);
|
||||
} else {
|
||||
typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
|
||||
TensorMapper, StorageIndex, Properties, CFactor, false,
|
||||
is_lhs_vec, true>
|
||||
ContractKernelName;
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
|
||||
vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
|
||||
vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait();
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -1616,19 +1615,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
|
||||
EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
|
||||
thread_range, local_range, K);
|
||||
thread_range, local_range, K).wait();
|
||||
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
|
||||
typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
|
||||
EvaluatorPointerType, StorageIndex, local_range>
|
||||
GenericRKernel;
|
||||
device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
|
||||
tmp_global_accessor, buffer,
|
||||
cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
|
||||
|
||||
cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()).wait();
|
||||
device().deallocate_temp(temp_pointer);
|
||||
} else {
|
||||
device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
|
||||
local_range, K);
|
||||
local_range, K).wait();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -1642,12 +1640,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
this->m_result = NULL;
|
||||
}
|
||||
}
|
||||
// The placeholder accessors must bound to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
this->m_leftImpl.bind(cgh);
|
||||
this->m_rightImpl.bind(cgh);
|
||||
this->m_result.bind(cgh);
|
||||
}
|
||||
};
|
||||
} // namespace Eigen
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
|
||||
|
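
Throughout the contraction file the kernel-launcher calls gain a trailing .wait(). Presumably the launchers now return the sycl::event of the submitted kernel instead of synchronizing internally, so callers that need the result (e.g. before a follow-up reduction or deallocate_temp) block on it explicitly. A minimal hedged sketch of that shape:

    #include <sycl/sycl.hpp>
    #include <cstddef>

    void add_one(sycl::queue& q, const float* in, float* out, std::size_t n) {
      sycl::event e = q.parallel_for(sycl::range<1>(n),
                                     [=](sycl::id<1> i) { out[i] = in[i] + 1.0f; });
      e.wait();  // chained form used in this file: launcher(...).wait()
    }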
@@ -442,12 +442,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>

/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif

protected:
TensorEvaluator<ArgType, Device> m_impl;

@ -61,8 +61,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
|
||||
return (boolean_check[0] && boolean_check[1]);
|
||||
}
|
||||
void operator()(cl::sycl::nd_item<2> itemID) const {
|
||||
auto buffer_ptr = buffer_acc.get_pointer();
|
||||
auto kernel_ptr = kernel_filter.get_pointer();
|
||||
auto buffer_ptr = buffer_acc;
|
||||
auto kernel_ptr = kernel_filter;
|
||||
// the required row to be calculated for the for each plane in shered memory
|
||||
const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
|
||||
const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
|
||||
@ -128,8 +128,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
|
||||
}
|
||||
|
||||
void operator()(cl::sycl::nd_item<3> itemID) const {
|
||||
auto buffer_ptr = buffer_acc.get_pointer();
|
||||
auto kernel_ptr = kernel_filter.get_pointer();
|
||||
auto buffer_ptr = buffer_acc;
|
||||
auto kernel_ptr = kernel_filter;
|
||||
// the required row to be calculated for the for each plane in shered memory
|
||||
const auto num_input = cl::sycl::range<2>{
|
||||
(cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
|
||||
@ -216,8 +216,8 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
|
||||
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
|
||||
}
|
||||
void operator()(cl::sycl::nd_item<3> itemID) const {
|
||||
auto buffer_ptr = buffer_acc.get_pointer();
|
||||
auto kernel_ptr = kernel_filter.get_pointer();
|
||||
auto buffer_ptr = buffer_acc;
|
||||
auto kernel_ptr = kernel_filter;
|
||||
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
|
||||
|
||||
const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
|
||||
@ -411,7 +411,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
|
||||
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
|
||||
m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
|
||||
indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
|
||||
indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])).wait();
|
||||
break;
|
||||
}
|
||||
|
||||
@ -442,7 +442,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
ConvKernel;
|
||||
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
|
||||
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
|
||||
indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
|
||||
indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}).wait();
|
||||
break;
|
||||
}
|
||||
|
||||
@ -482,7 +482,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
ConvKernel;
|
||||
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
|
||||
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
|
||||
indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
|
||||
indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP).wait();
|
||||
break;
|
||||
}
|
||||
|
||||
@ -519,13 +519,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
|
||||
}
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_kernelImpl.bind(cgh);
|
||||
m_inputImpl.bind(cgh);
|
||||
m_buf.bind(cgh);
|
||||
m_kernel.bind(cgh);
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
// No assignment (copies are needed by the kernels)
|
||||
|
@ -151,13 +151,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_result.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
void evalTo(EvaluatorPointerType data) {
|
||||
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
|
||||
@ -324,12 +317,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_result.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
void evalTo(EvaluatorPointerType data) {
|
||||
|
@ -31,7 +31,8 @@ struct SyclDeviceInfo {
|
||||
.template get_info<cl::sycl::info::device::local_mem_type>()),
|
||||
max_work_item_sizes(
|
||||
queue.get_device()
|
||||
.template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
|
||||
.template get_info<
|
||||
cl::sycl::info::device::max_work_item_sizes<3>>()),
|
||||
max_mem_alloc_size(
|
||||
queue.get_device()
|
||||
.template get_info<
|
||||
@ -69,7 +70,6 @@ struct SyclDeviceInfo {
|
||||
} // end namespace internal
|
||||
} // end namespace TensorSycl
|
||||
|
||||
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
|
||||
// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
|
||||
// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
|
||||
// TensorFlow via the Eigen SYCL Backend.
|
||||
@ -110,147 +110,52 @@ class QueueInterface {
|
||||
explicit QueueInterface(
|
||||
const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
|
||||
unsigned num_threads = std::thread::hardware_concurrency())
|
||||
: m_queue(dev_or_sel, handler),
|
||||
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||
m_prog(m_queue.get_context(), get_sycl_supported_devices()),
|
||||
#endif
|
||||
: m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}},
|
||||
m_thread_pool(num_threads),
|
||||
m_device_info(m_queue) {
|
||||
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||
m_prog.build_with_kernel_type<DeviceOrSelector>();
|
||||
auto f = [&](cl::sycl::handler &cgh) {
|
||||
cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
|
||||
[=]() {})
|
||||
};
|
||||
EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
|
||||
#endif
|
||||
}
|
||||
m_device_info(m_queue) {}
|
||||
|
||||
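
The queue is now constructed with sycl::property::queue::in_order(), so commands submitted through it execute in submission order without explicit event dependencies. A short hedged illustration:

    #include <sycl/sycl.hpp>
    #include <cstddef>

    void example(float* dev, const float* host, std::size_t n) {
      sycl::queue q{sycl::property::queue::in_order()};
      q.memcpy(dev, host, n * sizeof(float));                    // (1)
      q.parallel_for(sycl::range<1>(n),
                     [=](sycl::id<1> i) { dev[i] += 1.0f; });    // (2) runs after (1) completes
      q.wait();                                                  // block for both
    }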
template <typename DeviceOrSelector>
|
||||
explicit QueueInterface(
|
||||
const DeviceOrSelector &dev_or_sel,
|
||||
unsigned num_threads = std::thread::hardware_concurrency())
|
||||
: QueueInterface(dev_or_sel,
|
||||
[this](cl::sycl::exception_list l) {
|
||||
this->exception_caught_ = this->sycl_async_handler(l);
|
||||
},
|
||||
num_threads) {}
|
||||
|
||||
: QueueInterface(
|
||||
dev_or_sel,
|
||||
[this](cl::sycl::exception_list l) {
|
||||
this->exception_caught_ = this->sycl_async_handler(l);
|
||||
},
|
||||
num_threads) {}
|
||||
|
||||
explicit QueueInterface(
|
||||
const cl::sycl::queue& q, unsigned num_threads = std::thread::hardware_concurrency())
|
||||
: m_queue(q),
|
||||
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||
m_prog(m_queue.get_context(), get_sycl_supported_devices()),
|
||||
#endif
|
||||
m_thread_pool(num_threads),
|
||||
m_device_info(m_queue) {}
|
||||
const cl::sycl::queue &q,
|
||||
unsigned num_threads = std::thread::hardware_concurrency())
|
||||
: m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {}
|
||||
|
||||
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||
EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
|
||||
#endif
|
||||
|
||||
/// Attach an existing buffer to the pointer map, Eigen will not reuse it
|
||||
EIGEN_STRONG_INLINE void *attach_buffer(
|
||||
cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
return static_cast<void *>(pMapper.add_pointer(buf));
|
||||
}
|
||||
|
||||
/// Detach previously attached buffer
|
||||
EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
TensorSycl::internal::SYCLfree<false>(p, pMapper);
|
||||
}
|
||||
|
||||
/// Allocating device pointer. This pointer is actually an 8 bytes host
|
||||
/// pointer used as key to access the sycl device buffer. The reason is that
|
||||
/// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
|
||||
/// expressions. So we create a key pointer to be used in Eigen expression
|
||||
/// construction. When we convert the Eigen construction into the sycl
|
||||
/// construction we use this pointer as a key in our buffer_map and we make
|
||||
/// sure that we dedicate only one buffer only for this pointer. The device
|
||||
/// pointer would be deleted by calling deallocate function.
|
||||
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
|
||||
#if EIGEN_MAX_ALIGN_BYTES > 0
|
||||
size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
|
||||
if (align > 0) {
|
||||
num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
|
||||
}
|
||||
return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES,
|
||||
num_bytes, m_queue);
|
||||
#else
|
||||
return (void *)cl::sycl::malloc_device(num_bytes, m_queue);
|
||||
#endif
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
|
||||
}
|
||||
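
allocate() now rounds the request up to a multiple of EIGEN_MAX_ALIGN_BYTES and calls the aligned USM allocator. A hedged sketch of that pattern outside of the class (helper name assumed):

    #include <sycl/sycl.hpp>
    #include <cstddef>

    void* device_alloc(sycl::queue& q, std::size_t num_bytes,
                       std::size_t align /* e.g. EIGEN_MAX_ALIGN_BYTES */) {
      if (align > 0) {
        std::size_t rem = num_bytes % align;
        if (rem > 0) num_bytes += align - rem;                 // round up, as allocate() does
        return sycl::aligned_alloc_device(align, num_bytes, q);
      }
      return sycl::malloc_device(num_bytes, q);                // plain USM allocation otherwise
    }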
|
||||
EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
|
||||
#if EIGEN_MAX_ALIGN_BYTES > 0
|
||||
size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
|
||||
if (align > 0) {
|
||||
num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
|
||||
}
|
||||
#endif
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
|
||||
if (scratch_buffers.empty()) {
|
||||
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
|
||||
;
|
||||
} else {
|
||||
for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
|
||||
auto buff = pMapper.get_buffer(*it);
|
||||
if (buff.get_size() >= num_bytes) {
|
||||
auto ptr = *it;
|
||||
scratch_buffers.erase(it);
|
||||
return ptr;
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
|
||||
}
|
||||
#else
|
||||
return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
|
||||
#endif
|
||||
}
|
||||
template <typename data_t>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
|
||||
cl::sycl::access::mode::read_write, data_t>
|
||||
get(data_t *data) const {
|
||||
return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
|
||||
}
|
||||
template <typename data_t>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
|
||||
TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
|
||||
data_t>
|
||||
data) const {
|
||||
return static_cast<data_t *>(data.get_virtual_pointer());
|
||||
return (void *)cl::sycl::malloc_device<uint8_t>(num_bytes, m_queue);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
|
||||
scratch_buffers.insert(p);
|
||||
#else
|
||||
TensorSycl::internal::SYCLfree(p, pMapper);
|
||||
#endif
|
||||
}
|
||||
template <cl::sycl::access::mode AcMd, typename T>
|
||||
EIGEN_STRONG_INLINE void deallocate_temp(
|
||||
const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
|
||||
deallocate_temp(p.get_virtual_pointer());
|
||||
template <typename data_t>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
|
||||
return data;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); }
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const {
|
||||
deallocate_temp(const_cast<void *>(p));
|
||||
}
|
||||
|
||||
/// This is used to deallocate the device pointer. p is used as a key inside
|
||||
/// the map to find the device buffer and delete it.
|
||||
EIGEN_STRONG_INLINE void deallocate(void *p) const {
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
TensorSycl::internal::SYCLfree(p, pMapper);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate_all() const {
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
TensorSycl::internal::SYCLfreeAll(pMapper);
|
||||
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
|
||||
scratch_buffers.clear();
|
||||
#endif
|
||||
cl::sycl::free(p, m_queue);
|
||||
}
|
||||
|
||||
/// The memcpyHostToDevice is used to copy the data from host to device
@ -260,24 +165,7 @@ class QueueInterface {
EIGEN_STRONG_INLINE void memcpyHostToDevice(
void *dst, const void *src, size_t n,
std::function<void()> callback) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
static const auto global_access = cl::sycl::access::target::global_buffer;
typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
write_accessor;
if (n == 0) {
if (callback) callback();
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
auto non_deleter = [](buffer_scalar_t const *) {};
std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
cgh.copy(s_ptr, dst_acc);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
auto e = m_queue.memcpy(dst, src, n);
synchronize_and_callback(e, callback);
}
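// Example (not part of the patch): sketch of the USM host-to-device copy with
// an asynchronous completion callback, now that queue::memcpy returns an
// event. SYCL 2020 assumed; notify and the detached waiter thread are
// illustrative only (Eigen waits on its own thread pool instead).
#include <sycl/sycl.hpp>
#include <cstddef>
#include <functional>
#include <thread>

void copy_host_to_device(sycl::queue &q, void *dst, const void *src,
                         std::size_t n, std::function<void()> notify) {
  if (n == 0) {
    if (notify) notify();
    return;
  }
  sycl::event e = q.memcpy(dst, src, n);
  if (notify) {
    // Wait for completion off the calling thread, then run the callback.
    std::thread([e, notify]() mutable { e.wait(); notify(); }).detach();
  } else {
    e.wait();
  }
}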
@ -288,24 +176,11 @@ class QueueInterface {
EIGEN_STRONG_INLINE void memcpyDeviceToHost(
void *dst, const void *src, size_t n,
std::function<void()> callback) const {
static const auto read_mode = cl::sycl::access::mode::read;
static const auto global_access = cl::sycl::access::target::global_buffer;
typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
read_accessor;
if (n == 0) {
if (callback) callback();
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
auto non_deleter = [](buffer_scalar_t *) {};
std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
cgh.copy(src_acc, s_ptr);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
auto e = m_queue.memcpy(dst, src, n);
synchronize_and_callback(e, callback);
}
@ -313,257 +188,88 @@ class QueueInterface {
/// No callback is required here as both arguments are on the device
/// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
static const auto read_mode = cl::sycl::access::mode::read;
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (n == 0) {
return;
}
n /= sizeof(buffer_scalar_t);
auto f = [&](cl::sycl::handler &cgh) {
auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
cgh.copy(src_acc, dst_acc);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
m_queue.memcpy(dst, src, n).wait();
}
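// Example (not part of the patch): with USM both arguments are device
// pointers, so a device-to-device copy is a single queue::memcpy; waiting on
// the returned event (or calling .wait()) expresses the dependency.
// SYCL 2020 assumed.
#include <sycl/sycl.hpp>
#include <cstddef>

void copy_device_to_device(sycl::queue &q, void *dst, const void *src,
                           std::size_t n) {
  if (n == 0) return;
  q.memcpy(dst, src, n).wait();
}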
/// the memset function.
/// No callback is required here as both arguments are on the device
/// and SYCL can handle the dependency.
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
if (n == 0) {
return;
}
auto f = [&](cl::sycl::handler &cgh) {
// Get a typed range accessor to ensure we fill each byte, in case
// `buffer_scalar_t` is not (u)int8_t.
auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
cgh.fill(dst_acc, static_cast<uint8_t>(c));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
m_queue.memset(data, c, n).wait();
}

template<typename T>
EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
static const auto write_mode = cl::sycl::access::mode::discard_write;
template <typename T>
EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
if (begin == end) {
return;
}
const ptrdiff_t count = end - begin;
auto f = [&](cl::sycl::handler &cgh) {
auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
cgh.fill(dst_acc, value);
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
async_synchronize(e);
}
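// Example (not part of the patch): standalone sketch of the USM memset/fill
// paths above. queue::memset writes bytes, queue::fill writes typed values;
// both return events. SYCL 2020 assumed; sizes are arbitrary.
#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  float *data = sycl::malloc_device<float>(256, q);
  q.memset(data, 0, 256 * sizeof(float)).wait();  // byte-wise clear
  q.fill(data, 1.0f, 256).wait();                 // typed fill, 256 elements
  sycl::free(data, q);
  return 0;
}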
|
||||
|
||||
/// Get a range accessor to the virtual pointer's device memory. This range
|
||||
/// accessor will allow access to the memory from the pointer to the end of
|
||||
/// the buffer.
|
||||
///
|
||||
/// NOTE: Inside a kernel the range accessor will always be indexed from the
|
||||
/// start of the buffer, so the offset in the accessor is only used by
|
||||
/// methods like handler::copy and will not be available inside a kernel.
|
||||
template <cl::sycl::access::mode AcMd, typename T>
|
||||
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
|
||||
get_range_accessor(const void *ptr) const {
|
||||
static const auto global_access = cl::sycl::access::target::global_buffer;
|
||||
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
|
||||
typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
|
||||
typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
|
||||
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
|
||||
auto original_buffer = pMapper.get_buffer(ptr);
|
||||
const ptrdiff_t offset = pMapper.get_offset(ptr);
|
||||
eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
|
||||
eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
|
||||
const ptrdiff_t typed_offset = offset / sizeof(T);
|
||||
eigen_assert(typed_offset >= 0);
|
||||
const auto typed_size = original_buffer.get_size() / sizeof(T);
|
||||
auto buffer = original_buffer.template reinterpret<
|
||||
std::remove_const_t<T>>(
|
||||
cl::sycl::range<1>(typed_size));
|
||||
const ptrdiff_t size = buffer.get_count() - typed_offset;
|
||||
eigen_assert(size >= 0);
|
||||
typedef cl::sycl::accessor<std::remove_const_t<T>,
|
||||
1, AcMd, global_access, is_place_holder>
|
||||
placeholder_accessor_t;
|
||||
const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
|
||||
return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
|
||||
cl::sycl::id<1>(typed_offset)),
|
||||
static_cast<size_t>(typed_offset),
|
||||
reinterpret_cast<std::intptr_t>(start_ptr));
|
||||
}
|
||||
|
||||
/// Get a range accessor to the virtual pointer's device memory with a
|
||||
/// specified size.
|
||||
template <cl::sycl::access::mode AcMd, typename Index>
|
||||
EIGEN_STRONG_INLINE cl::sycl::accessor<
|
||||
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
|
||||
get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
|
||||
const Index n_bytes) const {
|
||||
static const auto global_access = cl::sycl::access::target::global_buffer;
|
||||
eigen_assert(n_bytes >= 0);
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
auto buffer = pMapper.get_buffer(ptr);
|
||||
const ptrdiff_t offset = pMapper.get_offset(ptr);
|
||||
eigen_assert(offset >= 0);
|
||||
eigen_assert(offset + n_bytes <= buffer.get_size());
|
||||
return buffer.template get_access<AcMd, global_access>(
|
||||
cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
|
||||
}
|
||||
|
||||
/// Get a range accessor to the virtual pointer's device memory with a
|
||||
/// specified type and count.
|
||||
template <cl::sycl::access::mode AcMd, typename T, typename Index>
|
||||
EIGEN_STRONG_INLINE cl::sycl::accessor<
|
||||
T, 1, AcMd, cl::sycl::access::target::global_buffer>
|
||||
get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
|
||||
const Index count) const {
|
||||
static const auto global_access = cl::sycl::access::target::global_buffer;
|
||||
eigen_assert(count >= 0);
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
auto buffer = pMapper.get_buffer(ptr);
|
||||
const ptrdiff_t offset = pMapper.get_offset(ptr);
|
||||
eigen_assert(offset >= 0);
|
||||
|
||||
// Technically we should create a subbuffer for the desired range,
|
||||
// then reinterpret that. However, I was not able to get changes to reflect
|
||||
// in the original buffer (only the subbuffer and reinterpretted buffer).
|
||||
// This current implementation now has the restriction that the buffer
|
||||
// offset and original buffer size must be a multiple of sizeof(T).
|
||||
// Note that get_range_accessor(void*) currently has the same restriction.
|
||||
//
|
||||
// auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
|
||||
// cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
|
||||
eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
|
||||
eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
|
||||
const ptrdiff_t typed_offset = offset / sizeof(T);
|
||||
const size_t typed_size = buffer.get_size() / sizeof(T);
|
||||
auto reint = buffer.template reinterpret<
|
||||
std::remove_const_t<T>>(
|
||||
cl::sycl::range<1>(typed_size));
|
||||
return reint.template get_access<AcMd, global_access>(
|
||||
cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
|
||||
}
|
||||
|
||||
/// Creation of sycl accessor for a buffer. This function first tries to find
|
||||
/// the buffer in the buffer_map. If found it gets the accessor from it, if
|
||||
/// not, the function then adds an entry by creating a sycl buffer for that
|
||||
/// particular pointer.
|
||||
template <cl::sycl::access::mode AcMd>
|
||||
EIGEN_STRONG_INLINE cl::sycl::accessor<
|
||||
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
|
||||
get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
return pMapper.get_buffer(ptr)
|
||||
.template get_access<AcMd, cl::sycl::access::target::global_buffer>(
|
||||
cgh);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
|
||||
const void *ptr) const {
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
return pMapper.get_buffer(ptr);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
|
||||
std::lock_guard<std::mutex> lock(pmapper_mutex_);
|
||||
return pMapper.get_offset(ptr);
|
||||
const size_t count = end - begin;
|
||||
m_queue.fill(begin, value, count).wait();
|
||||
}
|
||||
|
||||
template <typename OutScalar, typename sycl_kernel, typename Lhs,
typename Rhs, typename OutPtr, typename Range, typename Index,
typename... T>
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
const Rhs &rhs, OutPtr outptr,
Range thread_range,
Index scratchSize,
T... var) const {
EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(
const Lhs &lhs, const Rhs &rhs, OutPtr outptr, Range thread_range,
Index scratchSize, T... var) const {
auto kernel_functor = [=](cl::sycl::handler &cgh) {
// binding the placeholder accessors to a commandgroup handler
lhs.bind(cgh);
rhs.bind(cgh);
outptr.bind(cgh);
typedef cl::sycl::accessor<OutScalar, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::local>
LocalAccessor;

LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
program().template get_kernel<sycl_kernel>(),
#endif
thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
cgh.parallel_for(thread_range,
sycl_kernel(scratch, lhs, rhs, outptr, var...));
};
cl::sycl::event e;
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
async_synchronize(e);

return m_queue.submit(kernel_functor);
}
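// Example (not part of the patch): minimal sketch of the launcher pattern
// above, where the command group allocates local scratch memory and the
// submit event is returned to the caller instead of being synchronized
// internally. SYCL 2020 assumed; AddOne is an illustrative functor, not an
// Eigen kernel.
#include <sycl/sycl.hpp>
#include <cstddef>

struct AddOne {
  const float *in;
  float *out;
  sycl::local_accessor<float, 1> scratch;
  void operator()(sycl::nd_item<1> item) const {
    const std::size_t g = item.get_global_id(0);
    const std::size_t l = item.get_local_id(0);
    scratch[l] = in[g];                    // stage through local memory
    sycl::group_barrier(item.get_group());
    out[g] = scratch[l] + 1.0f;
  }
};

sycl::event launch(sycl::queue &q, const float *in, float *out,
                   sycl::nd_range<1> range, std::size_t scratch_size) {
  return q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<float, 1> scratch(sycl::range<1>(scratch_size), cgh);
    cgh.parallel_for(range, AddOne{in, out, scratch});
  });
}
// A caller can chain further work on the returned event or call .wait() on
// it, which is how the reduction and scan launchers below now append .wait().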
|
||||
|
||||
template <typename OutScalar, typename sycl_kernel, typename InPtr,
|
||||
typename OutPtr, typename Range, typename Index, typename... T>
|
||||
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
|
||||
OutPtr &outptr,
|
||||
Range thread_range,
|
||||
Index scratchSize,
|
||||
T... var) const {
|
||||
EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr,
|
||||
OutPtr &outptr,
|
||||
Range thread_range,
|
||||
Index scratchSize,
|
||||
T... var) const {
|
||||
auto kernel_functor = [=](cl::sycl::handler &cgh) {
|
||||
// binding the placeholder accessors to a commandgroup handler
|
||||
inptr.bind(cgh);
|
||||
outptr.bind(cgh);
|
||||
typedef cl::sycl::accessor<OutScalar, 1,
|
||||
cl::sycl::access::mode::read_write,
|
||||
cl::sycl::access::target::local>
|
||||
LocalAccessor;
|
||||
|
||||
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
|
||||
cgh.parallel_for(
|
||||
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||
program().template get_kernel<sycl_kernel>(),
|
||||
#endif
|
||||
thread_range, sycl_kernel(scratch, inptr, outptr, var...));
|
||||
cgh.parallel_for(thread_range,
|
||||
sycl_kernel(scratch, inptr, outptr, var...));
|
||||
};
|
||||
cl::sycl::event e;
|
||||
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
|
||||
async_synchronize(e);
|
||||
return m_queue.submit(kernel_functor);
|
||||
}
|
||||
|
||||
template <typename OutScalar, typename sycl_kernel, typename InPtr,
|
||||
typename Range, typename Index, typename... T>
|
||||
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
|
||||
Range thread_range,
|
||||
Index scratchSize,
|
||||
T... var) const {
|
||||
template <typename OutScalar, typename sycl_kernel, typename InPtr,
|
||||
typename Range, typename Index, typename... T>
|
||||
EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(
|
||||
const InPtr &inptr, Range thread_range, Index scratchSize,
|
||||
T... var) const {
|
||||
auto kernel_functor = [=](cl::sycl::handler &cgh) {
|
||||
// binding the placeholder accessors to a commandgroup handler
|
||||
inptr.bind(cgh);
|
||||
typedef cl::sycl::accessor<OutScalar, 1,
|
||||
cl::sycl::access::mode::read_write,
|
||||
cl::sycl::access::target::local>
|
||||
LocalAccessor;
|
||||
|
||||
LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
|
||||
cgh.parallel_for(
|
||||
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||
program().template get_kernel<sycl_kernel>(),
|
||||
#endif
|
||||
thread_range, sycl_kernel(scratch, inptr, var...));
|
||||
cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
|
||||
};
|
||||
cl::sycl::event e;
|
||||
EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
|
||||
async_synchronize(e);
|
||||
}
|
||||
|
||||
return m_queue.submit(kernel_functor);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void synchronize() const {
|
||||
#ifdef EIGEN_EXCEPTIONS
|
||||
@ -573,14 +279,6 @@ class QueueInterface {
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
|
||||
set_latest_event(e);
|
||||
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
|
||||
synchronize();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename Index>
|
||||
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
|
||||
Index &rng, Index &GRange) const {
|
||||
@ -776,48 +474,9 @@ class QueueInterface {
|
||||
return !exception_caught_;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
|
||||
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
|
||||
std::lock_guard<std::mutex> lock(event_mutex_);
|
||||
return latest_events_[std::this_thread::get_id()];
|
||||
#else
|
||||
eigen_assert(false);
|
||||
return cl::sycl::event();
|
||||
#endif
|
||||
}
|
||||
|
||||
// destructor
|
||||
~QueueInterface() {
|
||||
pMapper.clear();
|
||||
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
|
||||
scratch_buffers.clear();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> get_null_accessor()
|
||||
const {
|
||||
eigen_assert(null_buff_simulator.get_size() % sizeof(T) == 0 && "The null buffer size must be a multiple of sizeof(T)");
|
||||
const ptrdiff_t typed_size = null_buff_simulator.get_size() / sizeof(T);
|
||||
eigen_assert(typed_size > 0);
|
||||
auto typed_null_buff =
|
||||
null_buff_simulator.template reinterpret<T>(cl::sycl::range<1>(typed_size));
|
||||
return TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>(typed_null_buff);
|
||||
}
|
||||
|
||||
protected:
EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
std::lock_guard<std::mutex> lock(event_mutex_);
latest_events_[std::this_thread::get_id()] = e;
#else
EIGEN_UNUSED_VARIABLE(e);
#endif
}
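// Example (not part of the patch): standalone sketch of the per-thread
// "latest event" bookkeeping implemented by get_latest_event/set_latest_event
// above. EventRegistry is a hypothetical class; Eigen guards its map with
// event_mutex_ and only compiles it under EIGEN_SYCL_STORE_LATEST_EVENT.
#include <sycl/sycl.hpp>
#include <mutex>
#include <thread>
#include <unordered_map>

class EventRegistry {
 public:
  void set_latest(sycl::event e) {
    std::lock_guard<std::mutex> lock(mutex_);
    latest_[std::this_thread::get_id()] = e;
  }
  sycl::event get_latest() const {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = latest_.find(std::this_thread::get_id());
    return it == latest_.end() ? sycl::event() : it->second;
  }

 private:
  mutable std::mutex mutex_;
  std::unordered_map<std::thread::id, sycl::event> latest_;
};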
|
||||
|
||||
void synchronize_and_callback(cl::sycl::event e,
|
||||
const std::function<void()> &callback) const {
|
||||
set_latest_event(e);
|
||||
if (callback) {
|
||||
auto callback_ = [=]() {
|
||||
#ifdef EIGEN_EXCEPTIONS
|
||||
@ -850,29 +509,8 @@ class QueueInterface {
|
||||
|
||||
/// class members:
|
||||
bool exception_caught_ = false;
|
||||
|
||||
mutable std::mutex pmapper_mutex_;
|
||||
|
||||
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
|
||||
mutable std::mutex event_mutex_;
|
||||
mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
|
||||
#endif
|
||||
|
||||
/// std::map is the container used to make sure that we create only one buffer
|
||||
/// per pointer. The lifespan of the buffer now depends on the lifespan of
|
||||
/// SyclDevice. If a non-read-only pointer is needed to be accessed on the
|
||||
/// host we should manually deallocate it.
|
||||
mutable TensorSycl::internal::PointerMapper pMapper;
|
||||
cl::sycl::buffer<uint8_t, 1> null_buff_simulator = cl::sycl::buffer<uint8_t, 1>(cl::sycl::range<1>(128));
|
||||
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
|
||||
mutable std::unordered_set<void *> scratch_buffers;
|
||||
#endif
|
||||
/// sycl queue
|
||||
mutable cl::sycl::queue m_queue;
|
||||
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||
mutable cl::sycl::program m_prog;
|
||||
#endif
|
||||
|
||||
/// The thread pool is used to wait on events and call callbacks
|
||||
/// asynchronously
|
||||
mutable Eigen::ThreadPool m_thread_pool;
|
||||
@ -896,32 +534,6 @@ struct SyclDeviceBase {
|
||||
struct SyclDevice : public SyclDeviceBase {
|
||||
explicit SyclDevice(const QueueInterface *queue_stream)
|
||||
: SyclDeviceBase(queue_stream) {}
|
||||
|
||||
template <typename scalar_t>
|
||||
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, scalar_t>
|
||||
get_null_accessor() const {
|
||||
return queue_stream()->template get_null_accessor<scalar_t>();
|
||||
}
|
||||
// this is the accessor used to construct the evaluator
|
||||
template <cl::sycl::access::mode AcMd, typename T>
|
||||
EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
|
||||
get_range_accessor(const void *ptr) const {
|
||||
return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
|
||||
}
|
||||
|
||||
// get sycl accessor
|
||||
template <cl::sycl::access::mode AcMd>
|
||||
EIGEN_STRONG_INLINE cl::sycl::accessor<
|
||||
buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
|
||||
get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
|
||||
return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
|
||||
}
|
||||
|
||||
/// Accessing the created sycl device buffer for the device pointer
|
||||
EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
|
||||
const void *ptr) const {
|
||||
return queue_stream()->get_sycl_buffer(ptr);
|
||||
}
|
||||
|
||||
/// This is used to prepare the number of threads and also the number of
|
||||
/// threads per block for sycl kernels
|
||||
@ -966,40 +578,14 @@ struct SyclDevice : public SyclDeviceBase {
|
||||
EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
|
||||
queue_stream()->deallocate_temp(buffer);
|
||||
}
|
||||
template <cl::sycl::access::mode AcMd, typename T>
|
||||
EIGEN_STRONG_INLINE void deallocate_temp(
|
||||
const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const {
|
||||
queue_stream()->deallocate_temp(buffer);
|
||||
}
|
||||
EIGEN_STRONG_INLINE void deallocate_all() const {
|
||||
queue_stream()->deallocate_all();
|
||||
}
|
||||
|
||||
template <typename data_t>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
|
||||
cl::sycl::access::mode::read_write, data_t>
|
||||
get(data_t *data) const {
|
||||
return queue_stream()->get(data);
|
||||
}
|
||||
template <typename data_t>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
|
||||
TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
|
||||
data_t>
|
||||
data) const {
|
||||
return queue_stream()->get(data);
|
||||
}
|
||||
|
||||
/// attach existing buffer
|
||||
EIGEN_STRONG_INLINE void *attach_buffer(
|
||||
cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
|
||||
return queue_stream()->attach_buffer(buf);
|
||||
}
|
||||
/// detach buffer
|
||||
EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
|
||||
queue_stream()->detach_buffer(p);
|
||||
}
|
||||
EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
|
||||
return queue_stream()->get_offset(ptr);
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
|
||||
return data;
|
||||
}
|
||||
|
||||
// some runtime conditions that can be applied here
|
||||
@ -1029,19 +615,14 @@ struct SyclDevice : public SyclDeviceBase {
|
||||
queue_stream()->memset(data, c, n);
|
||||
}
|
||||
/// the fill function
|
||||
template<typename T>
|
||||
EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
|
||||
template <typename T>
|
||||
EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
|
||||
queue_stream()->fill(begin, end, value);
|
||||
}
|
||||
/// returning the sycl queue
|
||||
EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
|
||||
return queue_stream()->sycl_queue();
|
||||
}
|
||||
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||
EIGEN_STRONG_INLINE cl::sycl::program &program() const {
|
||||
return queue_stream()->program();
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
|
||||
|
||||
@ -1081,13 +662,6 @@ struct SyclDevice : public SyclDeviceBase {
|
||||
EIGEN_STRONG_INLINE void synchronize() const {
|
||||
queue_stream()->synchronize();
|
||||
}
|
||||
EIGEN_STRONG_INLINE void async_synchronize(
|
||||
cl::sycl::event e = cl::sycl::event()) const {
|
||||
queue_stream()->async_synchronize(e);
|
||||
}
|
||||
EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
|
||||
return queue_stream()->get_latest_event();
|
||||
}
|
||||
|
||||
// This function checks if the runtime recorded an error for the
|
||||
// underlying stream device.
|
||||
@ -1109,20 +683,20 @@ struct SyclDevice : public SyclDeviceBase {
|
||||
return queue_stream()->getDeviceVendor();
|
||||
}
|
||||
template <typename OutScalar, typename KernelType, typename... T>
|
||||
EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
|
||||
queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
|
||||
var...);
|
||||
EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
|
||||
return queue_stream()
|
||||
->template binary_kernel_launcher<OutScalar, KernelType>(var...);
|
||||
}
|
||||
template <typename OutScalar, typename KernelType, typename... T>
|
||||
EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
|
||||
queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
|
||||
var...);
|
||||
EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
|
||||
return queue_stream()
|
||||
->template unary_kernel_launcher<OutScalar, KernelType>(var...);
|
||||
}
|
||||
|
||||
template <typename OutScalar, typename KernelType, typename... T>
|
||||
EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
|
||||
queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
|
||||
var...);
|
||||
EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
|
||||
return queue_stream()
|
||||
->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
|
||||
}
};
} // end namespace Eigen
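// Example (not part of the patch): hedged usage sketch of the device
// interface above, modeled on the Eigen SYCL tests; the exact QueueInterface
// constructor arguments and selector spelling may differ between compilers.
// With USM enabled, allocate() returns a real device pointer that can be
// mapped directly into a TensorMap.
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::QueueInterface queue_interface(cl::sycl::default_selector_v);
  Eigen::SyclDevice device(&queue_interface);

  const std::ptrdiff_t n = 100;
  Eigen::Tensor<float, 1> a(n), b(n), c(n);
  a.setConstant(1.f);
  b.setConstant(2.f);

  float *d_a = static_cast<float *>(device.allocate(n * sizeof(float)));
  float *d_b = static_cast<float *>(device.allocate(n * sizeof(float)));
  float *d_c = static_cast<float *>(device.allocate(n * sizeof(float)));
  device.memcpyHostToDevice(d_a, a.data(), n * sizeof(float));
  device.memcpyHostToDevice(d_b, b.data(), n * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>> ta(d_a, n), tb(d_b, n), tc(d_c, n);
  tc.device(device) = ta + tb;  // expression runs on the SYCL queue
  device.memcpyDeviceToHost(c.data(), d_c, n * sizeof(float));
  device.synchronize();

  device.deallocate(d_a);
  device.deallocate(d_b);
  device.deallocate(d_c);
  return 0;
}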
|
||||
|
@ -217,13 +217,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
|
||||
ArgType expression() const { return m_expression; }
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
m_buffer.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
private:
|
||||
|
@ -183,12 +183,6 @@ struct TensorEvaluator
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_data.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
protected:
|
||||
EvaluatorPointerType m_data;
|
||||
Dimensions m_dims;
|
||||
@ -215,13 +209,7 @@ Eigen::half loadConstant(const Eigen::half* address) {
|
||||
return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
|
||||
}
|
||||
#endif
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// overload of load constant should be implemented here based on range access
|
||||
template <cl::sycl::access::mode AcMd, typename T>
|
||||
T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
|
||||
return *address;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// Default evaluator for rvalues
|
||||
@ -338,12 +326,7 @@ struct TensorEvaluator<const Derived, Device>
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_data.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
EvaluatorPointerType m_data;
|
||||
Dimensions m_dims;
|
||||
@ -425,13 +408,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_argImpl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
const NullaryOp m_functor;
|
||||
TensorEvaluator<ArgType, Device> m_argImpl;
|
||||
@ -538,14 +514,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
|
||||
m_argImpl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
private:
|
||||
const Device EIGEN_DEVICE_REF m_device;
|
||||
const UnaryOp m_functor;
|
||||
@ -678,13 +646,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_leftImpl.bind(cgh);
|
||||
m_rightImpl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
private:
|
||||
const Device EIGEN_DEVICE_REF m_device;
|
||||
const BinaryOp m_functor;
|
||||
@ -793,15 +754,6 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_arg1Impl.bind(cgh);
|
||||
m_arg2Impl.bind(cgh);
|
||||
m_arg3Impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
const TernaryOp m_functor;
|
||||
TensorEvaluator<Arg1Type, Device> m_arg1Impl;
|
||||
|
@ -743,7 +743,7 @@ class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
|
||||
evaluator,
|
||||
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
|
||||
cl::sycl::range<1>(tileSize)),
|
||||
Index(1), range);
|
||||
Index(1), range).wait();
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
|
@ -207,12 +207,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_data.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
|
||||
|
@ -219,13 +219,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
EvaluatorPointerType data() const { return m_buffer; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_buffer.bind(cgh);
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
private:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const ArgType m_op;
|
||||
|
@ -107,22 +107,6 @@ struct GpuDevice;
|
||||
struct SyclDevice;
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
|
||||
template <typename T> struct MakeSYCLPointer {
|
||||
typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
|
||||
constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
|
||||
return data;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
|
||||
template <typename T>
|
||||
struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
|
||||
|
||||
namespace TensorSycl {
|
||||
namespace internal{
|
||||
template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
|
||||
|
@ -266,11 +266,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
|
||||
|
@ -507,14 +507,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
|
||||
|
@ -228,13 +228,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
|
@ -123,13 +123,6 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
@ -221,12 +221,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
protected:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
NewDimensions m_dimensions;
|
||||
@ -655,12 +649,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
@ -1004,12 +992,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
|
||||
return NULL;
|
||||
}
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
{
|
||||
|
@ -485,13 +485,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
struct BlockIteratorState {
|
||||
BlockIteratorState()
|
||||
|
@ -269,13 +269,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
|
@ -541,24 +541,6 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
|
||||
template<typename ArgType, typename Device>
|
||||
struct TensorReductionEvaluatorBase;
|
||||
|
||||
namespace internal {
|
||||
namespace reduction {
|
||||
|
||||
template <typename CoeffReturnType, typename Device>
|
||||
EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
|
||||
typename std::enable_if<Eigen::internal::is_same<Device, Eigen::SyclDevice>::value, const Device>::type& dev) {
|
||||
return (dev.template get_null_accessor<CoeffReturnType>());
|
||||
}
|
||||
|
||||
template <typename CoeffReturnType, typename Device>
|
||||
EIGEN_ALWAYS_INLINE typename StorageMemory<CoeffReturnType, Device>::Type get_null_value(
|
||||
typename std::enable_if<!Eigen::internal::is_same<Device, Eigen::SyclDevice>::value, const Device>::type&) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
}// end namespace reduction
|
||||
} // end namespace internal
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
|
||||
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
|
||||
@ -623,7 +605,7 @@ static constexpr bool RunningOnGPU = false;
|
||||
EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device),
|
||||
m_reducer(op.reducer()),
|
||||
m_result(internal::reduction::get_null_value<CoeffReturnType, Device>(device)),
|
||||
m_result(NULL),
|
||||
m_device(device) {
|
||||
EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
|
||||
@ -911,13 +893,6 @@ static constexpr bool RunningOnGPU = false;
|
||||
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
|
||||
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
m_result.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
template <int, typename, typename> friend struct internal::GenericDimReducer;
|
||||
|
@ -96,8 +96,8 @@ struct SecondStepFullReducer {
|
||||
// algorithm must be changed if the number of reduce per thread in the
|
||||
// second step is greater than 1. Otherwise, the result will be wrong.
|
||||
const Index localid = itemID.get_local_id(0);
|
||||
auto aInPtr = aI.get_pointer() + localid;
|
||||
auto aOutPtr = outAcc.get_pointer();
|
||||
auto aInPtr = aI + localid;
|
||||
auto aOutPtr = outAcc;
|
||||
CoeffReturnType *scratchptr = scratch.get_pointer();
|
||||
CoeffReturnType accumulator = *aInPtr;
|
||||
|
||||
@ -146,7 +146,7 @@ class FullReductionKernelFunctor {
|
||||
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
|
||||
const cl::sycl::nd_item<1> &itemID) const {
|
||||
auto output_ptr = final_output.get_pointer();
|
||||
auto output_ptr = final_output;
|
||||
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
|
||||
Index globalid = itemID.get_global_id(0);
|
||||
Index localid = itemID.get_local_id(0);
|
||||
@ -185,7 +185,7 @@ class FullReductionKernelFunctor {
|
||||
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
|
||||
const cl::sycl::nd_item<1> &itemID) const {
|
||||
auto output_ptr = final_output.get_pointer();
|
||||
auto output_ptr = final_output;
|
||||
Index globalid = itemID.get_global_id(0);
|
||||
Index localid = itemID.get_local_id(0);
|
||||
// vectorizable parts
|
||||
@ -232,7 +232,7 @@ class GenericNondeterministicReducer {
|
||||
//This is to bypass the statefull condition in Eigen meanReducer
|
||||
Op non_const_functor;
|
||||
std::memcpy(&non_const_functor, &functor, sizeof (Op));
|
||||
auto output_accessor_ptr = output_accessor.get_pointer();
|
||||
auto output_accessor_ptr = output_accessor;
|
||||
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
|
||||
if (globalid < range) {
|
||||
CoeffReturnType accum = functor.initialize();
|
||||
@ -313,9 +313,9 @@ struct PartialReductionKernel {
|
||||
|
||||
Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
|
||||
const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
|
||||
auto scratchPtr = scratch.get_pointer().get();
|
||||
CoeffReturnType* scratchPtr = scratch.get_pointer();
|
||||
auto outPtr =
|
||||
output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
|
||||
output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
|
||||
CoeffReturnType accumulator = op.initialize();
|
||||
|
||||
element_wise_reduce(globalRId, globalPId, accumulator);
|
||||
@ -387,7 +387,7 @@ struct SecondStepPartialReduction {
|
||||
|
||||
if (globalId >= num_coeffs_to_preserve) return;
|
||||
|
||||
auto in_ptr = input_accessor.get_pointer() + globalId;
|
||||
auto in_ptr = input_accessor + globalId;
|
||||
|
||||
OutScalar accumulator = op.initialize();
|
||||
// num_coeffs_to_reduce is not bigger that 256
|
||||
@ -395,7 +395,7 @@ struct SecondStepPartialReduction {
|
||||
op.reduce(*in_ptr, &accumulator);
|
||||
in_ptr += num_coeffs_to_preserve;
|
||||
}
|
||||
output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
|
||||
output_accessor[globalId] = op.finalize(accumulator);
|
||||
}
|
||||
}; // namespace internal
|
||||
|
||||
@ -453,21 +453,18 @@ struct PartialReducerLauncher {
|
||||
EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
|
||||
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
|
||||
self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
|
||||
num_coeffs_to_reduce);
|
||||
|
||||
num_coeffs_to_reduce).wait();
|
||||
typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
|
||||
SecondStepPartialReductionKernel;
|
||||
|
||||
dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
|
||||
temp_accessor, output,
|
||||
cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
|
||||
reducer, num_coeffs_to_preserve, rNumGroups);
|
||||
|
||||
reducer, num_coeffs_to_preserve, rNumGroups).wait();
|
||||
self.device().deallocate_temp(temp_pointer);
|
||||
} else {
|
||||
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
|
||||
self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
|
||||
num_coeffs_to_reduce);
|
||||
num_coeffs_to_reduce).wait();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -512,20 +509,19 @@ struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
|
||||
static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
|
||||
typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
|
||||
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
|
||||
local_range, inputSize, reducer);
|
||||
|
||||
local_range, inputSize, reducer).wait();
|
||||
typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
|
||||
EvaluatorPointerType, Index, local_range>
|
||||
GenericRKernel;
|
||||
dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
|
||||
tmp_global_accessor, data,
|
||||
cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
|
||||
reducer);
|
||||
|
||||
reducer).wait();
|
||||
dev.deallocate_temp(temp_pointer);
|
||||
} else {
|
||||
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
|
||||
reducer);
|
||||
reducer).wait();
|
||||
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -574,7 +570,7 @@ struct GenericReducer<Self, Op, Eigen::SyclDevice> {
|
||||
dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
|
||||
TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
|
||||
self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
|
||||
reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
|
||||
reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1)).wait();
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
@ -368,13 +368,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
|
||||
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
Dimensions m_dimensions;
|
||||
array<Index, NumDims> m_strides;
|
||||
|
@ -507,13 +507,6 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
m_output.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
protected:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const Device EIGEN_DEVICE_REF m_device;
|
||||
|
@ -89,8 +89,8 @@ struct ScanKernelFunctor {
|
||||
|
||||
LocalAccessor scratch;
|
||||
Evaluator dev_eval;
|
||||
OutAccessor out_accessor;
|
||||
OutAccessor temp_accessor;
|
||||
OutAccessor out_ptr;
|
||||
OutAccessor tmp_ptr;
|
||||
const ScanParameters<Index> scanParameters;
|
||||
Op accumulator;
|
||||
const bool inclusive;
|
||||
@ -100,8 +100,8 @@ struct ScanKernelFunctor {
|
||||
const bool inclusive_)
|
||||
: scratch(scratch_),
|
||||
dev_eval(dev_eval_),
|
||||
out_accessor(out_accessor_),
|
||||
temp_accessor(temp_accessor_),
|
||||
out_ptr(out_accessor_),
|
||||
tmp_ptr(temp_accessor_),
|
||||
scanParameters(scanParameters_),
|
||||
accumulator(accumulator_),
|
||||
inclusive(inclusive_) {}
|
||||
@ -131,9 +131,7 @@ struct ScanKernelFunctor {
|
||||
first_step_inclusive_Operation(InclusiveOp) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||
auto out_ptr = out_accessor.get_pointer();
|
||||
auto tmp_ptr = temp_accessor.get_pointer();
|
||||
auto scratch_ptr = scratch.get_pointer().get();
|
||||
|
||||
|
||||
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
|
||||
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
|
||||
@ -190,7 +188,7 @@ struct ScanKernelFunctor {
|
||||
}
|
||||
private_offset *= 2;
|
||||
}
|
||||
scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
|
||||
scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
|
||||
private_scan[PacketSize - 1 + packetIndex];
|
||||
private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
|
||||
// traverse down tree & build scan
|
||||
@ -219,9 +217,9 @@ struct ScanKernelFunctor {
|
||||
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
|
||||
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
|
||||
CoeffReturnType accum = accumulator.initialize();
|
||||
accumulator.reduce(scratch_ptr[ai], &accum);
|
||||
accumulator.reduce(scratch_ptr[bi], &accum);
|
||||
scratch_ptr[bi] = accumulator.finalize(accum);
|
||||
accumulator.reduce(scratch[ai], &accum);
|
||||
accumulator.reduce(scratch[bi], &accum);
|
||||
scratch[bi] = accumulator.finalize(accum);
|
||||
}
|
||||
offset *= 2;
|
||||
}
|
||||
@ -234,10 +232,10 @@ struct ScanKernelFunctor {
|
||||
scanParameters.non_scan_size +
|
||||
group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
|
||||
block_id;
|
||||
tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
|
||||
tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
|
||||
}
|
||||
// clear the last element
|
||||
scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
|
||||
scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
|
||||
}
|
||||
// traverse down tree & build scan
|
||||
for (Index d = 1; d < scratch_stride; d *= 2) {
|
||||
@ -248,10 +246,10 @@ struct ScanKernelFunctor {
|
||||
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
|
||||
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
|
||||
CoeffReturnType accum = accumulator.initialize();
|
||||
accumulator.reduce(scratch_ptr[ai], &accum);
|
||||
accumulator.reduce(scratch_ptr[bi], &accum);
|
||||
scratch_ptr[ai] = scratch_ptr[bi];
|
||||
scratch_ptr[bi] = accumulator.finalize(accum);
|
||||
accumulator.reduce(scratch[ai], &accum);
|
||||
accumulator.reduce(scratch[bi], &accum);
|
||||
scratch[ai] = scratch[bi];
|
||||
scratch[bi] = accumulator.finalize(accum);
|
||||
}
|
||||
}
|
||||
// Synchronise
|
||||
@ -262,7 +260,7 @@ struct ScanKernelFunctor {
|
||||
EIGEN_UNROLL_LOOP
|
||||
for (Index i = 0; i < PacketSize; i++) {
|
||||
CoeffReturnType accum = private_scan[packetIndex + i];
|
||||
accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
|
||||
accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
|
||||
private_scan[packetIndex + i] = accumulator.finalize(accum);
|
||||
}
|
||||
}
|
||||
@ -294,22 +292,20 @@ struct ScanAdjustmentKernelFunctor {
|
||||
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
|
||||
LocalAccessor;
|
||||
static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
|
||||
InAccessor in_accessor;
|
||||
OutAccessor out_accessor;
|
||||
InAccessor in_ptr;
|
||||
OutAccessor out_ptr;
|
||||
const ScanParameters<Index> scanParameters;
|
||||
Op accumulator;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
|
||||
OutAccessor out_accessor_,
|
||||
const ScanParameters<Index> scanParameters_,
|
||||
Op accumulator_)
|
||||
: in_accessor(in_accessor_),
|
||||
out_accessor(out_accessor_),
|
||||
: in_ptr(in_accessor_),
|
||||
out_ptr(out_accessor_),
|
||||
scanParameters(scanParameters_),
|
||||
accumulator(accumulator_) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||
auto in_ptr = in_accessor.get_pointer();
|
||||
auto out_ptr = out_accessor.get_pointer();
|
||||
|
||||
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
|
||||
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
|
||||
@ -426,7 +422,7 @@ struct SYCLAdjustBlockOffset {
|
||||
AdjustFuctor;
|
||||
dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
|
||||
scan_info.max_elements_per_block,
|
||||
scan_info.get_scan_parameter(), accumulator);
|
||||
scan_info.get_scan_parameter(), accumulator).wait();
|
||||
}
|
||||
};
|
||||
|
||||
@ -449,7 +445,7 @@ struct ScanLauncher_impl {
|
||||
typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
|
||||
dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
|
||||
in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
|
||||
scan_info.get_scan_parameter(), accumulator, inclusive);
|
||||
scan_info.get_scan_parameter(), accumulator, inclusive).wait();
|
||||
|
||||
if (scan_info.block_size > 1) {
|
||||
ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
|
||||
|
@ -290,12 +290,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
|
||||
Index input_index,
|
||||
|
@ -223,12 +223,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
{
|
||||
|
@ -256,13 +256,6 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifdef EIGEN_USE_SYCL
|
||||
// binding placeholder accessors to a command group handler for SYCL
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||
m_impl.bind(cgh);
|
||||
}
|
||||
#endif
|
||||
|
||||
protected:
|
||||
// Given the output index, finds the first index in the input tensor used to compute the trace
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
|
||||
|

@ -535,12 +535,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif
 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {

@ -148,9 +148,6 @@ if(EIGEN_TEST_SYCL)
    if(EIGEN_SYCL_REG_N)
      add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
    endif()
    if(EIGEN_SYCL_USE_PROGRAM_CLASS)
      add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
    endif()
    if(EIGEN_SYCL_ASYNC_EXECUTION)
      add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
    endif()

@ -23,15 +23,6 @@
#include <stdint.h>
#include <iostream>

#ifdef SYCL_COMPILER_IS_DPCPP
template <typename T>
struct cl::sycl::is_device_copyable<
    OffByOneScalar<T>,
    std::enable_if_t<!(!std::is_trivially_copyable<OffByOneScalar<T>>::value &&
                       (std::is_const_v<OffByOneScalar<T>> || std::is_volatile_v<OffByOneScalar<T>>))>>
    : std::true_type {};
#endif

template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
  IndexType sizeDim1 = 100;
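The compiler-guarded specialization removed above declared OffByOneScalar<T> device-copyable for DPC++; SYCL 2020 standardizes this opt-in as sycl::is_device_copyable, so the workaround is no longer needed. A hedged sketch of the standard mechanism, using a made-up Wrapper type rather than the test's OffByOneScalar:

#include <sycl/sycl.hpp>
#include <type_traits>

// Made-up example type passed by value into a kernel. The user-provided copy
// constructor makes it non-trivially-copyable, so the trait below matters.
template <typename T>
struct Wrapper {
  T value;
  Wrapper(T v) : value(v) {}
  Wrapper(const Wrapper& other) : value(other.value) {}
};

// SYCL 2020 trait: declaring the type device-copyable lets it be captured into
// kernels even though the implementation cannot prove copyability on its own.
template <typename T>
struct sycl::is_device_copyable<Wrapper<T>> : std::true_type {};

int main() {
  sycl::queue q;
  int* out = sycl::malloc_device<int>(1, q);
  Wrapper<int> w(41);
  q.single_task([=] { *out = w.value + 1; }).wait();
  int result = 0;
  q.memcpy(&result, out, sizeof(int)).wait();
  sycl::free(out, q);
  return 0;
}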

@ -39,7 +30,7 @@ void test_device_memory(const Eigen::SyclDevice &sycl_device) {
  Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
  Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));

  // memset
  memset(in1.data(), 1, in1.size() * sizeof(DataType));
  sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));

@ -47,7 +38,7 @@ void test_device_memory(const Eigen::SyclDevice &sycl_device) {
  for (IndexType i=0; i<in.size(); i++) {
    VERIFY_IS_EQUAL(in(i), in1(i));
  }

  // fill
  DataType value = DataType(7);
  std::fill_n(in1.data(), in1.size(), value);
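For reference, the device wrapper's memset and fill exercised by this test have direct USM counterparts in SYCL 2020, so with USM enabled they presumably reduce to plain queue operations along these lines (a standalone sketch, no Eigen involved):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  constexpr size_t n = 100;
  int* dev = sycl::malloc_device<int>(n, q);

  // Byte-wise memset on device memory, matching std::memset semantics.
  q.memset(dev, 1, n * sizeof(int)).wait();

  // Typed fill on device memory, matching std::fill_n semantics.
  q.fill(dev, 7, n).wait();

  int host[n];
  q.memcpy(host, dev, sizeof(host)).wait();
  sycl::free(dev, q);
  return 0;
}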

@ -77,31 +68,6 @@ void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
  sycl_device.deallocate(gpu_data);
}

template<typename DataType, int DataLayout, typename IndexType>
void test_device_attach_buffer(const Eigen::SyclDevice &sycl_device) {
  IndexType sizeDim1 = 100;

  array<IndexType, 1> tensorRange = {{sizeDim1}};
  Tensor<DataType, 1, DataLayout, IndexType> in(tensorRange);

  cl::sycl::buffer<buffer_scalar_t, 1> buffer(cl::sycl::range<1>(sizeDim1 * sizeof(DataType)));
  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.attach_buffer(buffer));

  // fill
  DataType value = DataType(7);
  std::fill_n(in.data(), in.size(), value);
  sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);

  // Check that buffer is filled with the correct value.
  auto reint = buffer.reinterpret<DataType>(cl::sycl::range<1>(sizeDim1));
  auto access = reint.template get_access<cl::sycl::access::mode::read>();
  for (IndexType i=0; i<in.size(); i++) {
    VERIFY_IS_EQUAL(in(i), access[i]);
  }

  sycl_device.detach_buffer(gpu_in_data);
}

template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
  QueueInterface queueInterface(d);

@ -112,7 +78,6 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
  //test_device_exceptions<DataType, RowMajor>(sycl_device);
  /// this test throw an exception. enable it if you want to see the exception
  //test_device_exceptions<DataType, ColMajor>(sycl_device);
  test_device_attach_buffer<DataType, ColMajor, int64_t>(sycl_device);
}

EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {