mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-14 04:35:57 +08:00)

commit 7318daf887 (parent: b8cc5635d5)

    Fixing LLVM error on TensorMorphingSycl.h on GPU; fixing int64_t crash for tensor_broadcast_sycl on GPU; adding get_sycl_supported_devices() on syclDevice.h.
@@ -400,10 +400,12 @@
 // Does the compiler support variadic templates?
 #ifndef EIGEN_HAS_VARIADIC_TEMPLATES
 #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
-  && ( defined(__SYCL_DEVICE_ONLY__) || !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
+  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
 // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
 //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#elif EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) && defined(__SYCL_DEVICE_ONLY__)
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
 #else
 #define EIGEN_HAS_VARIADIC_TEMPLATES 0
 #endif
@@ -82,6 +82,8 @@ typedef unsigned __int64 uint64_t;
 #endif
 #endif
 
+// tuple construction
+#include "src/Tensor/TensorSyclTuple.h"
 #include "src/Tensor/TensorMacros.h"
 #include "src/Tensor/TensorForwardDeclarations.h"
 #include "src/Tensor/TensorMeta.h"
@@ -17,6 +17,23 @@
 
 namespace Eigen {
 
+auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
+  auto devices = cl::sycl::device::get_devices();
+  std::vector<cl::sycl::device>::iterator it =devices.begin();
+  while(it!=devices.end()) {
+    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
+    auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
+    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+    if((*it).is_cpu() && s.find("amd")!=std::string::npos){
+      it=devices.erase(it);
+    }
+    else{
+      ++it;
+    }
+  }
+  printf("Device size %ld\n", devices.size());
+  return devices;
+}
 #define ConvertToActualTypeSycl(T, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<T>::pointer_t>((&(*buf_acc.get_pointer())))
 
 struct QueueInterface {
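For context, a minimal sketch of how the new helper is meant to be consumed (assumes EIGEN_USE_SYCL, a ComputeCpp-style SYCL runtime, and an include path that exposes the unsupported Tensor module; the name query is illustrative):

    // Enumerate the devices left after the AMD-OpenCL-CPU filtering above.
    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      for (const auto& device : Eigen::get_sycl_supported_devices()) {
        std::cout << device.get_info<cl::sycl::info::device::name>() << std::endl;
      }
      return 0;
    }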
@@ -109,27 +126,6 @@ struct QueueInterface {
   ~QueueInterface() { buffer_map.clear(); }
 };
 
-template <typename T> class MemCopyFunctor {
- public:
-  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
-  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
-  MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
-  void operator()(cl::sycl::nd_item<1> itemID) {
-    auto src_ptr = ConvertToActualTypeSycl(T, m_src_acc);
-    auto dst_ptr = ConvertToActualTypeSycl(T, m_dst_acc);
-    auto globalid = itemID.get_global_linear_id();
-    if (globalid < m_rng) {
-      dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
-    }
-  }
- private:
-  read_accessor m_src_acc;
-  write_accessor m_dst_acc;
-  size_t m_rng;
-  size_t m_i;
-  size_t m_offset;
-};
-
 struct SyclDevice {
   // class member.
   QueueInterface* m_queue_stream;
@@ -150,16 +146,16 @@ struct SyclDevice {
   }
 
   /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
-  template<typename T>
-  EIGEN_STRONG_INLINE void parallel_for_setup(T n, T &tileSize, T &rng, T &GRange) const {
-    tileSize =static_cast<T>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2);
+  template<typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+    tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2);
     rng = n;
-    if (rng==0) rng=static_cast<T>(1);
+    if (rng==0) rng=static_cast<Index>(1);
     GRange=rng;
     if (tileSize>GRange) tileSize=GRange;
     else if(GRange>tileSize){
-      T xMode = static_cast<T>(GRange % tileSize);
-      if (xMode != 0) GRange += static_cast<T>(tileSize - xMode);
+      Index xMode = static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
     }
   }
   /// allocate device memory
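parallel_for_setup rounds the global range up to the nearest multiple of the work-group size, since a SYCL nd_range requires the global size to be divisible by the local size; the kernel then masks off the padded work-items with an `if (globalid < rng)` guard. A standalone sketch of the same arithmetic (plain C++, hypothetical helper name and values):

    #include <cassert>
    #include <cstdint>

    // Same round-up rule as parallel_for_setup above: GRange becomes the
    // smallest multiple of tileSize that is >= rng.
    template <typename Index>
    void round_up_range(Index n, Index maxWorkGroupSize, Index &tileSize, Index &rng, Index &GRange) {
      tileSize = maxWorkGroupSize / 2;  // same heuristic as the device query above
      rng = (n == 0) ? Index(1) : n;
      GRange = rng;
      if (tileSize > GRange) tileSize = GRange;
      else {
        Index rem = GRange % tileSize;
        if (rem != 0) GRange += tileSize - rem;
      }
    }

    int main() {
      int64_t tile, rng, grange;
      round_up_range<int64_t>(1000, 256, tile, rng, grange);
      assert(tile == 128 && rng == 1000 && grange == 1024);  // 1024 = 8 * 128
      return 0;
    }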
@@ -188,7 +184,7 @@ struct SyclDevice {
     sycl_queue().submit([&](cl::sycl::handler &cgh) {
       auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
       auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
+      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
     });
     sycl_queue().throw_asynchronous();
   }
@@ -219,7 +215,7 @@ struct SyclDevice {
     sycl_queue().submit([&](cl::sycl::handler &cgh) {
       auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
       auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
     });
     sycl_queue().throw_asynchronous();
   }
@@ -68,7 +68,13 @@ struct fixed_size_tensor_index_extraction_helper
                const Dimensions& dimensions)
   {
     const Index mult = (index == n-1) ? 1 : 0;
-    return array_get<n-1>(dimensions) * mult +
+    return
+#ifdef EIGEN_USE_SYCL
+    utility::tuple::get<n-1>(dimensions)
+#else
+    array_get<n-1>(dimensions)
+#endif
+    * mult +
     fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
   }
 };
@@ -92,6 +98,9 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0>
 template <typename std::ptrdiff_t... Indices>
 struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
   typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+#ifdef EIGEN_USE_SYCL
+  const decltype(utility::tuple::make_tuple(Indices...)) t= utility::tuple::make_tuple(Indices...);
+#endif
   static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
@@ -120,7 +129,11 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
+#ifdef EIGEN_USE_SYCL
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
+#else
     return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
+#endif
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
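The extraction helper above selects one runtime index out of a compile-time list by summing dims[n-1] * mask over all n, where the 0/1 mask is nonzero for exactly one term; the SYCL branch merely swaps array_get for utility::tuple::get. A plain-C++ miniature of the same trick (hypothetical names, std::array instead of Eigen's numeric_list):

    #include <array>
    #include <cassert>
    #include <cstddef>

    // Recursive compile-time lookup: exactly one term of the sum survives,
    // the one whose position matches the runtime `index`.
    template <std::size_t n>
    struct pick_helper {
      template <std::size_t N>
      static std::ptrdiff_t run(std::size_t index, const std::array<std::ptrdiff_t, N> &dims) {
        const std::ptrdiff_t mult = (index == n - 1) ? 1 : 0;
        return std::get<n - 1>(dims) * mult + pick_helper<n - 1>::run(index, dims);
      }
    };
    template <>
    struct pick_helper<0> {
      template <std::size_t N>
      static std::ptrdiff_t run(std::size_t, const std::array<std::ptrdiff_t, N> &) { return 0; }
    };

    int main() {
      std::array<std::ptrdiff_t, 3> dims = {4, 5, 6};
      assert(pick_helper<3>::run(1, dims) == 5);
      return 0;
    }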
@@ -21,6 +21,15 @@ namespace Eigen {
 template<typename T> struct MakePointer {
   typedef T* Type;
 };
+#if defined(EIGEN_USE_SYCL)
+namespace TensorSycl {
+namespace internal{
+template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
+template <typename T> class MemCopyFunctor;
+}
+}
+#endif
+
 
 template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
 template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
@@ -37,6 +37,8 @@ namespace {
   {
 #ifdef __CUDA_ARCH__
     return __clz(val);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    return cl::sycl::clz(val);
 #elif EIGEN_COMP_MSVC
     unsigned long index;
     _BitScanReverse(&index, val);
@@ -53,6 +55,8 @@ namespace {
   {
 #ifdef __CUDA_ARCH__
     return __clzll(val);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    return cl::sycl::clz(val);
 #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
     unsigned long index;
     _BitScanReverse64(&index, val);
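Both new branches route count-leading-zeros to cl::sycl::clz, the SYCL built-in, so the fast integer-division setup can size its fixed-point reciprocal on SYCL devices just as it does with __clz/__clzll under CUDA and _BitScanReverse under MSVC. A portable sketch of what all of these branches compute (plain C++, illustrative only, not the intrinsic path):

    #include <cassert>
    #include <cstdint>

    // Count leading zeros of a 32-bit value, the quantity every branch
    // above obtains from its platform intrinsic.
    static int clz32(uint32_t val) {
      if (val == 0) return 32;
      int n = 0;
      while ((val & 0x80000000u) == 0) { val <<= 1; ++n; }
      return n;
    }

    int main() {
      assert(clz32(1) == 31);
      assert(clz32(0x80000000u) == 0);
      assert(clz32(256) == 23);  // 256 = 1 << 8, so 31 - 8 leading zeros
      return 0;
    }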
@@ -88,6 +92,8 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
 #if defined(__CUDA_ARCH__)
     return __umulhi(a, b);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
 #else
     return (static_cast<uint64_t>(a) * b) >> 32;
 #endif
@@ -97,6 +103,8 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
 #if defined(__CUDA_ARCH__)
     return __umul64hi(a, b);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
 #elif defined(__SIZEOF_INT128__)
     __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
     return static_cast<uint64_t>(v >> 64);
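Every branch of muluh returns the high word of a widening multiply, which the fast integer-division path multiplies by instead of dividing. Where neither an intrinsic nor __uint128_t is available, the same result can be assembled from 32-bit limbs; a sketch of that equivalence (plain C++, hypothetical helper, not Eigen's fallback):

    #include <cassert>
    #include <cstdint>

    // High 64 bits of a 64x64 -> 128 multiply via 32-bit limbs; agrees with
    // __umul64hi, cl::sycl::mul_hi and the __uint128_t shift above.
    static uint64_t mulhi64(uint64_t a, uint64_t b) {
      uint64_t a_lo = a & 0xffffffffu, a_hi = a >> 32;
      uint64_t b_lo = b & 0xffffffffu, b_hi = b >> 32;
      uint64_t p0 = a_lo * b_lo, p1 = a_lo * b_hi, p2 = a_hi * b_lo, p3 = a_hi * b_hi;
      uint64_t carry = ((p0 >> 32) + (p1 & 0xffffffffu) + (p2 & 0xffffffffu)) >> 32;
      return p3 + (p1 >> 32) + (p2 >> 32) + carry;
    }

    int main() {
      assert(mulhi64(1ull << 32, 1ull << 32) == 1);  // 2^32 * 2^32 = 2^64
      assert(mulhi64(~0ull, 2) == 1);                // (2^64 - 1) * 2 = 2^65 - 2
      return 0;
    }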
@@ -13,6 +13,7 @@
 
 namespace Eigen {
 
+
 /** \class TensorReduction
   * \ingroup CXX11_Tensor_Module
   *
@@ -691,6 +692,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
+#if defined(EIGEN_USE_SYCL)
+  template < typename HostExpr_, typename PlaceHolderExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
+
+#endif
+
   template <typename S, typename O, typename D> friend struct internal::InnerReducer;
 
   // Returns the Index in the input tensor of the first value that needs to be
@@ -25,6 +25,7 @@
 namespace Eigen {
 namespace internal {
 
+
 template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
 template<typename BufferTOut, typename BufferTIn>
 static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
@@ -180,6 +181,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
 
 };
 
+
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
 
@@ -190,42 +192,50 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
     typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
     typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
     auto functors = TensorSycl::internal::extractFunctors(self.impl());
+    typedef decltype(functors) FunctorExpr;
     typename Self::Index range, GRange, tileSize;
-    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
+    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
 
     // getting final out buffer at the moment the created buffer is true because there is no need for assign
     /// creating the shared memory for calculating reduction.
     /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
     /// recursively apply reduction on it in order to reduce the whole.
-    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
-    Dims dims= self.xprDims();
-    Op functor = reducer;
+    // Dims dims= self.xprDims();
+    //Op functor = reducer;
+
 
+    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
     dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
       // create a tuple of accessors from Evaluator
       auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      typedef typename Eigen::internal::remove_all<decltype(tuple_of_accessors)>::type Tuple_of_Acc;
       auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
 
-      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
-        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
+                        TensorSycl::internal::ReductionFunctor<HostExpr, PlaceHolderExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
+                        (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
+
+      // [=](cl::sycl::nd_item<1> itemID) {
+      //   typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+      //   auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
       /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
       /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
       /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
-      const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+      // const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
       /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
       /// the device_evaluator is detectable and recognisable on the device.
-      typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
-      auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
-      auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
+      // typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
+      // auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+      // auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
       /// const cast added as a naive solution to solve the qualifier drop error
-      auto globalid=itemID.get_global_linear_id();
-      if (globalid< range) {
-        typename DeviceSelf::CoeffReturnType accum = functor.initialize();
-        GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
-        functor.finalize(accum);
-        output_accessor_ptr[globalid]= accum;
-      }
-      });
+      // auto globalid=itemID.get_global_linear_id();
+      // if (globalid< range) {
+      //   typename DeviceSelf::CoeffReturnType accum = functor.initialize();
+      //   GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
+      //   functor.finalize(accum);
+      //   output_accessor_ptr[globalid]= accum;
+      // }
+      // });
     });
     dev.sycl_queue().throw_asynchronous();
     return false;
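The rewrite above moves the kernel body out of an inline lambda into the named TensorSycl::internal::ReductionFunctor, presumably because a SYCL 1.2 kernel must be registered under a type the device compiler can name: a function object carries its own type for that purpose, while a lambda needs an explicitly supplied name parameter (the old code passed <Self>). A stripped-down sketch of the named-functor kernel pattern (hypothetical AddOne kernel, assuming a ComputeCpp-style SYCL implementation, not Eigen's actual functor):

    #include <CL/sycl.hpp>

    class AddOne {  // the functor's own type doubles as the kernel name
     public:
      typedef cl::sycl::accessor<int, 1, cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::global_buffer> acc_t;
      AddOne(acc_t acc, size_t rng) : m_acc(acc), m_rng(rng) {}
      void operator()(cl::sycl::nd_item<1> item) {
        size_t gid = item.get_global_linear_id();
        if (gid < m_rng) m_acc[gid] += 1;  // guard against padded work-items
      }
     private:
      acc_t m_acc;
      size_t m_rng;
    };

    int main() {
      int data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
      cl::sycl::queue q;
      {
        cl::sycl::buffer<int, 1> buf(data, cl::sycl::range<1>(8));
        q.submit([&](cl::sycl::handler &cgh) {
          auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
          cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(8),
                                                 cl::sycl::range<1>(4)),
                           AddOne(acc, 8));
        });
      }  // buffer destruction copies the results back into data
      return data[0] == 1 ? 0 : 1;
    }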
@@ -47,8 +47,6 @@ template<typename T> struct GetType<false, T>{
 }
 }
 
-// tuple construction
-#include "TensorSyclTuple.h"
 
 // counting number of leaf at compile time
 #include "TensorSyclLeafCount.h"
@@ -77,6 +75,8 @@ template<typename T> struct GetType<false, T>{
 
 // kernel execution using fusion
 #include "TensorSyclRun.h"
+//sycl functors
+#include "TensorSyclFunctors.h"
 
 #endif  // end of EIGEN_USE_SYCL
 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h (new file, 83 lines)
@@ -0,0 +1,83 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: eigen@codeplay.com
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// General include header of SYCL target for Tensor Module
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+/// ReductionFunctor
+template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
+ public:
+  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+  ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_)
+  :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
+  void operator()(cl::sycl::nd_item<1> itemID) {
+
+    typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+    auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+    /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
+    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+    /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
+    const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+    /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
+    /// the device_evaluator is detectable and recognisable on the device.
+    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
+    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+    auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
+    /// const cast added as a naive solution to solve the qualifier drop error
+    auto globalid=static_cast<Index>(itemID.get_global_linear_id());
+    if (globalid< range) {
+      typename DeviceSelf::CoeffReturnType accum = functor.initialize();
+      Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
+      functor.finalize(accum);
+      output_accessor_ptr[globalid]= accum;
+    }
+  }
+ private:
+  write_accessor output_accessor;
+  FunctorExpr functors;
+  Tuple_of_Acc tuple_of_accessors;
+  Dims dims;
+  Op functor;
+  Index range;
+};
+
+/// Memcopyfuncdeveicetohost
+template <typename T> class MemCopyFunctor {
+ public:
+  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
+  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+  MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
+  void operator()(cl::sycl::nd_item<1> itemID) {
+    auto src_ptr = ConvertToActualTypeSycl(T, m_src_acc);
+    auto dst_ptr = ConvertToActualTypeSycl(T, m_dst_acc);
+    auto globalid = itemID.get_global_linear_id();
+    if (globalid < m_rng) {
+      dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
+    }
+  }
+ private:
+  read_accessor m_src_acc;
+  write_accessor m_dst_acc;
+  size_t m_rng;
+  size_t m_i;
+  size_t m_offset;
+};
+
+}
+}
+}
+#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
@@ -20,6 +20,8 @@
 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
+#ifdef EIGEN_USE_SYCL
+
 namespace utility {
 namespace tuple {
 /// \struct StaticIf
@@ -231,4 +233,5 @@ Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
 }
 } // tuple
 } // utility
+#endif //EIGEN_USE_SYCL
 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
@@ -136,21 +136,14 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
   test_broadcast_sycl<DataType, RowMajor, int>(sycl_device);
   test_broadcast_sycl_fixed<DataType, ColMajor, int>(sycl_device);
   test_broadcast_sycl<DataType, ColMajor, int>(sycl_device);
-
-
   test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
   test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
-  // the folowing two test breaks the intel gpu and amd gpu driver (cannot create opencl kernel)
-  // test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
-  // test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
 }
 
 void test_cxx11_tensor_broadcast_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s= device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
   }
 }
@@ -264,15 +264,10 @@ static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
 }
 
 void test_cxx11_tensor_builtins_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s= device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos){
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
     CALL_SUBTEST(test_builtin_unary_sycl(sycl_device));
     CALL_SUBTEST(test_builtin_binary_sycl(sycl_device));
   }
-  }
 }
@@ -71,11 +71,7 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
 }
 
 void test_cxx11_tensor_device_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s= device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
   }
 }
@@ -70,11 +70,7 @@ template <typename DataType, typename Dev_selector> void tensorForced_evalperDev
   test_forced_eval_sycl<DataType, ColMajor>(sycl_device);
 }
 void test_cxx11_tensor_forced_eval_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s= device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
   }
 }
@@ -82,12 +82,7 @@ template<typename DataType, typename dev_Selector> void sycl_slicing_test_per_de
 }
 void test_cxx11_tensor_morphing_sycl()
 {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    /// Currentlly it only works on cpu. Adding GPU cause LLVM ERROR in cunstructing OpenCL Kernel at runtime.
-    auto s= device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(device.is_cpu() && s.find("amd")==std::string::npos)
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_slicing_test_per_device<float>(device));
   }
 }
@@ -141,11 +141,7 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::
   test_last_dim_reductions_sycl<DataType, ColMajor>(sycl_device);
 }
 void test_cxx11_tensor_reduction_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s= device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_reduction_test_per_device<float>(device));
   }
 }
@@ -197,11 +197,8 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
   test_sycl_computations<DataType, ColMajor>(sycl_device);
 }
 void test_cxx11_tensor_sycl() {
-  for (const auto& device : cl::sycl::device::get_devices()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
-    auto s= device.template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(!device.is_cpu() || s.find("amd")==std::string::npos)
+  auto devices =Eigen::get_sycl_supported_devices();
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
   }
 }