Adding the TensorIndexTuple and TensorTupleReducerOp (ArgMax/Min) backend for SYCL; fixing the address-space issue for const TensorMap; converting all discard_write accessors to write due to a data mismatch.

This commit is contained in:
Mehdi Goli 2017-03-07 14:27:10 +00:00
parent 8296b87d7b
commit f84963ed95
19 changed files with 813 additions and 226 deletions
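For context, this is roughly what the new ArgMax/ArgMin SYCL path enables on the user side. A minimal sketch, assuming EIGEN_USE_SYCL and a working ComputeCpp/SYCL toolchain; the device setup mirrors the existing SYCL tests, and the shapes and names are illustrative only:

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Pick a supported OpenCL device and wrap it in the Eigen SYCL device.
  auto devices = Eigen::get_sycl_supported_devices();
  Eigen::QueueInterface queue_interface(devices[0]);
  Eigen::SyclDevice sycl_device(&queue_interface);

  Eigen::Tensor<float, 3> in(2, 3, 5);
  in.setRandom();
  Eigen::Tensor<Eigen::DenseIndex, 2> out(2, 5);

  float* gpu_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
  Eigen::DenseIndex* gpu_out = static_cast<Eigen::DenseIndex*>(
      sycl_device.allocate(out.size() * sizeof(Eigen::DenseIndex)));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > in_gpu(gpu_in, 2, 3, 5);
  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 2> > out_gpu(gpu_out, 2, 5);

  sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
  out_gpu.device(sycl_device) = in_gpu.argmax(1);   // argmin(1) works the same way
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(Eigen::DenseIndex));

  sycl_device.deallocate(gpu_in);
  sycl_device.deallocate(gpu_out);
  return 0;
}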

View File

@@ -119,6 +119,12 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
// required by sycl
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
return m_impl;
}
protected:
TensorEvaluator<ArgType, Device> m_impl;
};
@@ -222,7 +228,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_orig_impl(op.expression(), device),
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
- m_return_dim(op.return_dim()) {
m_return_dim(op.return_dim()), m_device(device) {
gen_strides(m_orig_impl.dimensions(), m_strides);
if (Layout == static_cast<int>(ColMajor)) {
@@ -252,7 +258,16 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
}
#ifndef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
#else // following functions are required by sycl
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}
const Device& device() const{return m_device;}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
@@ -292,6 +307,8 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
StrideDims m_strides;
Index m_stride_mod;
Index m_stride_div;
// required by sycl
const Device& m_device;
};
} // end namespace Eigen

View File

@@ -0,0 +1,146 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorArgMaxSycl.h
* \brief:
* TensorArgMaxSycl
*
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
namespace Eigen {
namespace internal {
template<typename Dims, typename XprType>
struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense>
{
typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type;
};
template<typename Dims, typename XprType>
struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1,
typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type>
{
typedef TensorTupleReducerDeviceOp<Dims, XprType> type;
};
template<typename StrideDims, typename XprType>
struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType>
{
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
}// end namespace internal
template<typename StrideDims, typename XprType>
class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested;
typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr,
const int return_dim,
const StrideDims& strides,
const Index& stride_mod, const Index& stride_div)
:m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
int return_dim() const { return m_return_dim; }
EIGEN_DEVICE_FUNC
const StrideDims& strides() const { return m_strides; }
EIGEN_DEVICE_FUNC
const Index& stride_mod() const { return m_stride_mod; }
EIGEN_DEVICE_FUNC
const Index& stride_div() const { return m_stride_div; }
protected:
typename Eigen::internal::remove_all<typename
XprType::Nested
>::type m_xpr;
const int m_return_dim;
const StrideDims& m_strides;
const Index m_stride_mod;
const Index m_stride_div;
};
// Eval as rvalue
template<typename StrideDims, typename ArgType>
struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>
{
typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType;
typedef typename XprType::Index Index;
typedef typename XprType::Index Scalar;
typedef Index CoeffReturnType;
typedef typename XprType::CoeffReturnType TupleType;
typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions;
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
CoordAccess = false,
RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
: m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
m_stride_div(op.stride_div()){}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
return m_impl.dimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
const TupleType v = m_impl.coeff(index);
return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
}
typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast<ptr_Dev_type>(m_impl.data()); }
protected:
TensorEvaluator<ArgType , SyclKernelDevice> m_impl;
const int m_return_dim;
const StrideDims& m_strides;
const Index& m_stride_mod;
const Index& m_stride_div;
};
} // end namespace Eigen
#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
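For reference, coeff() above turns the flat element index stored in the tuple (v.first) back into the coordinate along return_dim using only the precomputed strides, exactly like the host-side TensorTupleReducerOp evaluator. A standalone sketch of that arithmetic (no SYCL needed; sizes are illustrative):

#include <cstddef>
#include <iostream>

int main() {
  // Column-major 3D tensor of shape (4, 3, 5); argmax along dimension 1.
  const std::ptrdiff_t strides[3] = {1, 4, 12};   // what gen_strides produces for ColMajor
  const int return_dim = 1;
  const std::ptrdiff_t stride_div = strides[return_dim];       // 4
  const std::ptrdiff_t stride_mod = strides[return_dim + 1];   // 12 (total size when return_dim is the last dimension)

  // Suppose the tuple reduction found its maximum at coordinates (2, 1, 3):
  const std::ptrdiff_t flat = 2 + 1 * strides[1] + 3 * strides[2];   // v.first == 42
  const std::ptrdiff_t coord = (flat % stride_mod) / stride_div;     // (42 % 12) / 4 == 1
  std::cout << "argmax coordinate along dim 1: " << coord << "\n";
  return 0;
}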

View File

@@ -11,7 +11,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
* TensorTensorContractionsycl.h
*
* \brief:
* TensorContractionsycl
@@ -389,9 +389,9 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
- WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors,
localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
- m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice()));
});
self.device().asynchronousExec();
}

View File

@@ -45,7 +45,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter
void operator()(cl::sycl::nd_item<2> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -103,7 +103,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter
void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -173,7 +173,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
void operator()(cl::sycl::nd_item<3> itemID) {
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -339,8 +339,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// create input tuple of accessors
InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutputAccessorType;
- OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, data);
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);

View File

@@ -41,9 +41,8 @@ namespace Eigen {
size_t m_i;
size_t m_offset;
};
template<typename AccType>
struct memsetkernelFunctor{
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
AccType m_acc;
const ptrdiff_t buff_offset;
const size_t m_rng, m_c;
@@ -55,15 +54,19 @@
};
//get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU)
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
auto devices = cl::sycl::device::get_devices();
std::vector<cl::sycl::device>::iterator it =devices.begin();
while(it!=devices.end()) {
- /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
///FIXME: Currently there is a bug in amd cpu OpenCL
auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs
it=devices.erase(it);
//FIXME: currently there is a bug in intel gpu driver regarding memory allignment issue.
}else if((*it).is_gpu() && s.find("intel")!=std::string::npos){
it=devices.erase(it);
}
else{
++it;
@@ -112,6 +115,154 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
}))
#endif
{}
//FIXME: currently we have to switch back to write as discard_write doesnot work in forloop
template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
std::lock_guard<std::mutex> lock(mutex_);
auto host_acc= find_buffer(dst)->second. template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>();
::memcpy(host_acc.get_pointer(), src, n);
}
template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
std::lock_guard<std::mutex> lock(mutex_);
// Assuming that the dst is the start of the destination pointer
auto it =find_buffer(src);
auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
offset/=sizeof(Index);
size_t rng, GRange, tileSize;
parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
m_queue.submit([&](cl::sycl::handler &cgh) {
auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
typedef decltype(src_acc) read_accessor;
typedef decltype(dst_acc) write_accessor;
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
});
synchronize();
}
EIGEN_STRONG_INLINE void synchronize() const {
std::lock_guard<std::mutex> lock(mutex_);
m_queue.wait_and_throw(); //pass
}
EIGEN_STRONG_INLINE void asynchronousExec() const {
///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
//sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
std::lock_guard<std::mutex> lock(mutex_);
m_queue.wait_and_throw(); //pass
}
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
tileSize =static_cast<Index>(m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
}
rng = n;
if (rng==0) rng=static_cast<Index>(1);
GRange=rng;
if (tileSize>GRange) tileSize=GRange;
else if(GRange>tileSize){
Index xMode = static_cast<Index>(GRange % tileSize);
if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
}
}
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
}
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
rng1=dim1;
if (rng1==0 ) rng1=static_cast<Index>(1);
GRange1=rng1;
if (tileSize1>GRange1) tileSize1=GRange1;
else if(GRange1>tileSize1){
Index xMode = static_cast<Index>(GRange1 % tileSize1);
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
}
tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
rng0 = dim0;
if (rng0==0 ) rng0=static_cast<Index>(1);
GRange0=rng0;
if (tileSize0>GRange0) tileSize0=GRange0;
else if(GRange0>tileSize0){
Index xMode = static_cast<Index>(GRange0 % tileSize0);
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
}
}
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
}
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
rng2=dim2;
if (rng2==0 ) rng2=static_cast<Index>(1);
GRange2=rng2;
if (tileSize2>GRange2) tileSize2=GRange2;
else if(GRange2>tileSize2){
Index xMode = static_cast<Index>(GRange2 % tileSize2);
if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
}
pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
rng1=dim1;
if (rng1==0 ) rng1=static_cast<Index>(1);
GRange1=rng1;
if (tileSize1>GRange1) tileSize1=GRange1;
else if(GRange1>tileSize1){
Index xMode = static_cast<Index>(GRange1 % tileSize1);
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
}
tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
rng0 = dim0;
if (rng0==0 ) rng0=static_cast<Index>(1);
GRange0=rng0;
if (tileSize0>GRange0) tileSize0=GRange0;
else if(GRange0>tileSize0){
Index xMode = static_cast<Index>(GRange0 % tileSize0);
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
}
}
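For intuition about the three overloads above: each one caps the work-group size (256 on CPU devices), splits it across dimensions by powers of two, and pads every global range up to a multiple of its tile size so that a partial last tile is still launched. A standalone sketch of the 1D rounding (illustrative values; the device queries are omitted):

#include <cstdio>

// Same rounding rule as parallel_for_setup above: pad GRange to a multiple of tileSize.
static void setup_1d(long n, long max_wg, long& tileSize, long& rng, long& GRange) {
  tileSize = max_wg;
  rng = (n == 0) ? 1 : n;
  GRange = rng;
  if (tileSize > GRange) tileSize = GRange;
  else if (GRange % tileSize != 0) GRange += tileSize - (GRange % tileSize);
}

int main() {
  long tileSize, rng, GRange;
  setup_1d(1000, 256, tileSize, rng, GRange);
  // 1000 % 256 == 232, so 24 padding work-items are added: 4 groups of 256.
  std::printf("tileSize=%ld rng=%ld GRange=%ld\n", tileSize, rng, GRange);   // 256 1000 1024
  return 0;
}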
EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
std::lock_guard<std::mutex> lock(mutex_);
return m_queue.get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
// return stream_->deviceProperties().multiProcessorCount;
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
std::lock_guard<std::mutex> lock(mutex_);
return m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
// return stream_->deviceProperties().maxThreadsPerBlock;
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
std::lock_guard<std::mutex> lock(mutex_);
// OpenCL doesnot have such concept
return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
// return stream_->deviceProperties().maxThreadsPerMultiProcessor;
}
EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
std::lock_guard<std::mutex> lock(mutex_);
return m_queue.get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
// return stream_->deviceProperties().sharedMemPerBlock;
}
/// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
/// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key
@@ -119,10 +270,10 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
/// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
/// The device pointer would be deleted by calling deallocate function.
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
std::lock_guard<std::mutex> lock(mutex_);
auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes));
auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
buf.set_final_data(nullptr);
- std::lock_guard<std::mutex> lock(mutex_);
buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
return static_cast<void*>(ptr);
}
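As the comments above explain, the value returned by allocate() is not host storage; it is an 8-byte host pointer used purely as the key into buffer_map, so all data movement has to go through the device's memcpy helpers. A minimal usage sketch under that assumption (SYCL-enabled build required; names are illustrative):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  Eigen::QueueInterface queue_interface(Eigen::get_sycl_supported_devices()[0]);
  Eigen::SyclDevice dev(&queue_interface);

  // 'key' only identifies the underlying cl::sycl::buffer; never dereference it directly.
  float* key = static_cast<float*>(dev.allocate(128 * sizeof(float)));
  std::vector<float> host(128, 1.0f);
  dev.memcpyHostToDevice(key, host.data(), 128 * sizeof(float));
  dev.memcpyDeviceToHost(host.data(), key, 128 * sizeof(float));
  dev.deallocate(key);
  return 0;
}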
@@ -193,48 +344,13 @@ struct SyclDevice {
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
- tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
- auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
- }
- rng = n;
- if (rng==0) rng=static_cast<Index>(1);
- GRange=rng;
- if (tileSize>GRange) tileSize=GRange;
- else if(GRange>tileSize){
- Index xMode = static_cast<Index>(GRange % tileSize);
- if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
- }
m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange);
}
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
- }
- tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
- rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
- }
m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1);
}
@@ -242,39 +358,8 @@ struct SyclDevice {
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
- rng2=dim2;
- if (rng2==0 ) rng1=static_cast<Index>(1);
- GRange2=rng2;
- if (tileSize2>GRange2) tileSize2=GRange2;
- else if(GRange2>tileSize2){
- Index xMode = static_cast<Index>(GRange2 % tileSize2);
- if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
- }
- pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
- }
- tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
- rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
- }
m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2);
}
/// allocate device memory
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
@@ -319,8 +404,7 @@ struct SyclDevice {
/// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
/// this buffer is accessed, the data will be copied to the device.
template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
- auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
- ::memcpy(host_acc.get_pointer(), src, n);
m_queue_stream->memcpyHostToDevice(dst,src,n);
}
/// The memcpyDeviceToHost is used to copy the data from the device back to the host. Here, in order to avoid double copying the data, we create a sycl
/// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
@@ -329,21 +413,7 @@ struct SyclDevice {
/// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
/// to the cpu only once per function call.
template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
- auto it = m_queue_stream->find_buffer(src);
- auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
- offset/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- // Assuming that the dst is the start of the destination pointer
- auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
- });
- synchronize();
m_queue_stream->memcpyDeviceToHost(dst,src,n);
}
/// returning the sycl queue
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
@@ -366,8 +436,9 @@ struct SyclDevice {
:m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
void operator()(cl::sycl::handler &cgh) const {
- auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c));
auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
typedef decltype(buf_acc) AccType;
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c));
}
};
@@ -403,14 +474,13 @@ struct SyclDevice {
EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
EIGEN_STRONG_INLINE void synchronize() const {
- sycl_queue().wait_and_throw(); //pass
m_queue_stream->synchronize(); //pass
}
EIGEN_STRONG_INLINE void asynchronousExec() const {
///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
//sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
- sycl_queue().wait_and_throw(); //pass
m_queue_stream->asynchronousExec();
}
// This function checks if the runtime recorded an error for the
// underlying stream device.
@@ -418,8 +488,10 @@
return m_queue_stream->ok();
}
};
// This is used as a distinguishable device inside the kernel, as the sycl device class is not standard layout.
// This is internal and must not be used by users. This dummy device allows us to specialise the tensor evaluator
// inside the kernel, so we can have two types of eval, one for the host and one for the device. This is required for the TensorArgMax operation.
struct SyclKernelDevice:DefaultDevice{};
} // end namespace Eigen
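The SyclKernelDevice tag above lets a TensorEvaluator be specialised separately for code that runs inside a SYCL kernel (the contraction and convolution kernels earlier in this commit now build their device expressions with Eigen::SyclKernelDevice instead of Eigen::DefaultDevice, and the new TensorArgMaxSycl.h evaluator is keyed on it). A generic, non-Eigen sketch of that tag-dispatch pattern:

#include <iostream>

struct DefaultDevice {};
struct SyclKernelDevice : DefaultDevice {};   // cheap, standard-layout tag type

template <typename Expr, typename Device>
struct Evaluator {                            // host-side path
  static const char* where() { return "host evaluator"; }
};

template <typename Expr>
struct Evaluator<Expr, SyclKernelDevice> {    // selected inside the kernel
  static const char* where() { return "in-kernel evaluator"; }
};

struct SomeExpr {};

int main() {
  std::cout << Evaluator<SomeExpr, DefaultDevice>::where() << "\n";
  std::cout << Evaluator<SomeExpr, SyclKernelDevice>::where() << "\n";
  return 0;
}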

View File

@@ -193,7 +193,12 @@ struct TensorEvaluator<const Derived, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
eigen_assert(m_data);
#ifndef __SYCL_DEVICE_ONLY__
return loadConstant(m_data+index);
#else
CoeffReturnType tmp = m_data[index];
return tmp;
#endif
}
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

View File

@@ -124,7 +124,9 @@ template <typename U, typename V> struct Tuple {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Tuple& operator= (const Tuple& rhs) {
#ifndef __SYCL_DEVICE_ONLY__
if (&rhs == this) return *this;
#endif
first = rhs.first;
second = rhs.second;
return *this;

View File

@@ -35,7 +35,7 @@ static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI
/* Two accessors are used: one to the buffer that is being reduced,
* and a second to local memory, used to store intermediate data. */
auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
- auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h);
typedef decltype(aI) InputAccessor;
typedef decltype(aOut) OutputAccessor;
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
@@ -158,7 +158,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
// create a tuple of accessors from Evaluator
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, output);
ptrdiff_t out_offset = dev.get_offset(output);
Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),

View File

@@ -32,6 +32,8 @@ struct MakeLocalPointer {
namespace Eigen {
template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp;
template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>;
namespace TensorSycl {
namespace internal {
@@ -48,6 +50,13 @@ template<typename T> struct GetType<false, T>{
typedef T Type;
};
template <bool Conds, size_t X , size_t Y > struct ValueCondition {
static const size_t Res =X;
};
template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
static const size_t Res =Y;
};
}
}
}
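ValueCondition above is a small compile-time selector, moved here from TensorSyclExprConstructor.h so it can be shared; the new ArgMax expression constructor uses it to clamp a zero-dimensional reduction to rank one. A minimal standalone illustration of the pattern:

#include <cstddef>

template <bool Conds, size_t X, size_t Y> struct ValueCondition { static const size_t Res = X; };
template <size_t X, size_t Y> struct ValueCondition<false, X, Y> { static const size_t Res = Y; };

// e.g. ValueCondition<NumRedDims==0, 1, NumRedDims>::Res, as used for the ArgMax output rank:
static_assert(ValueCondition<(0 == 0), 1, 0>::Res == 1, "rank-0 reductions are mapped to rank 1");
static_assert(ValueCondition<(3 == 0), 1, 3>::Res == 3, "otherwise the reduced rank is kept");

int main() { return 0; }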
@@ -80,6 +89,9 @@ template<typename T> struct GetType<false, T>{
/// this is used for extracting tensor reduction
#include "TensorReductionSycl.h"
// TensorArgMaxSycl.h
#include "TensorArgMaxSycl.h"
/// this is used for extracting tensor convolution
#include "TensorConvolutionSycl.h"

View File

@@ -103,7 +103,7 @@ KERNELBROKERCONVERT(, false, TensorEvalToOp)
#undef KERNELBROKERCONVERT
/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp
- #define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(CVQual, ExprNode)\
#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\
template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
@@ -111,15 +111,17 @@ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
// TensorForcedEvalOp
- KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp)
- KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp)
// TensorLayoutSwapOp
- KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorLayoutSwapOp)
- KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorLayoutSwapOp)
- #undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp)
//TensorIndexTupleOp
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp)
KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp)
#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
#define KERNELBROKERCONVERTREDUCTION(CVQual)\
@@ -132,6 +134,18 @@ KERNELBROKERCONVERTREDUCTION(const)
KERNELBROKERCONVERTREDUCTION()
#undef KERNELBROKERCONVERTREDUCTION
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\
template <typename OP, typename Dim, typename subExpr>\
struct ConvertToDeviceExpression<CVQual TensorTupleReducerOp<OP, Dim, subExpr> > {\
typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\
};
KERNELBROKERCONVERTTUPLEREDUCTION(const)
KERNELBROKERCONVERTTUPLEREDUCTION()
#undef KERNELBROKERCONVERTTUPLEREDUCTION
//TensorSlicingOp
#define KERNELBROKERCONVERTSLICEOP(CVQual)\
template<typename StartIndices, typename Sizes, typename XprType>\
struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
@@ -142,7 +156,7 @@ KERNELBROKERCONVERTSLICEOP(const)
KERNELBROKERCONVERTSLICEOP()
#undef KERNELBROKERCONVERTSLICEOP
//TensorStridingSlicingOp
#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
@@ -153,7 +167,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const)
KERNELBROKERCONVERTERSLICESTRIDEOP()
#undef KERNELBROKERCONVERTERSLICESTRIDEOP
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
template <DenseIndex DimId, typename Expr>\
@@ -164,9 +177,6 @@ KERNELBROKERCONVERTCHIPPINGOP(const)
KERNELBROKERCONVERTCHIPPINGOP()
#undef KERNELBROKERCONVERTCHIPPINGOP
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp
#define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
@@ -188,8 +198,6 @@ KERNELBROKERCONVERTVOLUMEPATCHOP(const)
KERNELBROKERCONVERTVOLUMEPATCHOP()
#undef KERNELBROKERCONVERTVOLUMEPATCHOP
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen

View File

@@ -65,7 +65,6 @@ CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\
: expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
};
TENSORMAP(const)
TENSORMAP()
#undef TENSORMAP
@@ -83,6 +82,7 @@ CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Option
ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
: expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
};
TENSORMAPFIXEDSIZE(const)
TENSORMAPFIXEDSIZE()
#undef TENSORMAPFIXEDSIZE
@@ -189,9 +189,6 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
ASSIGN()
#undef ASSIGN
/// specialisation of the \ref ExprConstructor struct when the node type is
/// const TensorAssignOp
#define CONVERSIONEXPRCONST(CVQual)\
@@ -252,8 +249,6 @@ FORCEDEVAL(const)
FORCEDEVAL()
#undef FORCEDEVAL
#define TENSORCUSTOMUNARYOP(CVQual)\
template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
@@ -274,13 +269,6 @@ TENSORCUSTOMUNARYOP(const)
TENSORCUSTOMUNARYOP()
#undef TENSORCUSTOMUNARYOP
- template <bool Conds, size_t X , size_t Y > struct ValueCondition {
- static const size_t Res =X;
- };
- template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
- static const size_t Res =Y;
- };
/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
#define SYCLREDUCTIONEXPR(CVQual)\
template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
@@ -299,6 +287,35 @@ SYCLREDUCTIONEXPR(const)
SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR
/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp
/// use reductionOp instead of the TensorTupleReducerOp in order to build the tensor map. Because the tensorMap is the output of Tensor ReductionOP.
#define SYCLTUPLEREDUCTIONEXPR(CVQual)\
template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\
CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\
static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\
static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\
static const int Layout =static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\
typedef CVQual TensorMap<\
Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\
Layout,\
MakeGlobalPointer\
> XprType;\
typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\
static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\
typedef array<Index, NumDims> StrideDims;\
typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
:expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\
fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\
}\
};
SYCLTUPLEREDUCTIONEXPR(const)
SYCLTUPLEREDUCTIONEXPR()
#undef SYCLTUPLEREDUCTIONEXPR
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp
@@ -319,15 +336,18 @@ CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params
:expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};
//TensorContractionOp
SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
//TensorConvolutionOp
SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
//TensorCustomBinaryOp
SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
#undef SYCLCONTRACTCONVCUSBIOPS
//TensorSlicingOp
#define SYCLSLICEOPEXPR(CVQual)\
template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
@@ -344,7 +364,7 @@ SYCLSLICEOPEXPR(const)
SYCLSLICEOPEXPR()
#undef SYCLSLICEOPEXPR
//TensorStridingSlicingOp
#define SYCLSLICESTRIDEOPEXPR(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
@@ -361,6 +381,7 @@ SYCLSLICESTRIDEOPEXPR(const)
SYCLSLICESTRIDEOPEXPR()
#undef SYCLSLICESTRIDEOPEXPR
//TensorReshapingOp and TensorShufflingOp
#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
@@ -373,13 +394,15 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
};
// TensorReshapingOp
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
// TensorShufflingOp
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
//TensorPaddingOp
#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
@ -392,11 +415,11 @@ struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\ : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
}; };
//TensorPaddingOp
SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const) SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
SYCLPADDINGOPEXPRCONST(TensorPaddingOp, ) SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
#undef SYCLPADDINGOPEXPRCONST #undef SYCLPADDINGOPEXPRCONST
// TensorChippingOp // TensorChippingOp
#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\ #define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\ template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
@ -454,14 +477,12 @@ SYCLTENSORVOLUMEPATCHOPEXPR(const)
SYCLTENSORVOLUMEPATCHOPEXPR() SYCLTENSORVOLUMEPATCHOPEXPR()
#undef SYCLTENSORVOLUMEPATCHOPEXPR #undef SYCLTENSORVOLUMEPATCHOPEXPR
// TensorLayoutSwapOp and TensorIndexTupleOp
#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\
// TensorLayoutSwapOp
#define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\
template<typename OrigXprType, typename XprType, typename... Params>\ template<typename OrigXprType, typename XprType, typename... Params>\
struct ExprConstructor<CVQual TensorLayoutSwapOp <OrigXprType> , CVQual TensorLayoutSwapOp<XprType>, Params... >{\ struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... >{\
typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
typedef CVQual TensorLayoutSwapOp<typename my_xpr_type::Type> Type;\ typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\
my_xpr_type xprExpr;\ my_xpr_type xprExpr;\
Type expr;\ Type expr;\
template <typename FuncDetector>\ template <typename FuncDetector>\
@ -469,10 +490,14 @@ struct ExprConstructor<CVQual TensorLayoutSwapOp <OrigXprType> , CVQual TensorLa
: xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\ : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\
}; };
SYCLTENSORLAYOUTSWAPOPEXPR(const) //TensorLayoutSwapOp
SYCLTENSORLAYOUTSWAPOPEXPR() SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp)
#undef SYCLTENSORLAYOUTSWAPOPEXPR SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp)
//TensorIndexTupleOp
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp)
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp)
#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR
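For reference, a rough expansion of the generalised macro for the new argmax support; the constructor line hidden behind the hunk header above is assumed to follow the same (FuncDetector, tuple-of-accessors) pattern as the other ExprConstructor specialisations in this file:
// Sketch only: SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp) roughly expands to
template<typename OrigXprType, typename XprType, typename... Params>
struct ExprConstructor<const TensorIndexTupleOp<OrigXprType>, const TensorIndexTupleOp<XprType>, Params...> {
  typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;
  typedef const TensorIndexTupleOp<typename my_xpr_type::Type> Type;
  my_xpr_type xprExpr;
  Type expr;
  template <typename FuncDetector>
  ExprConstructor(FuncDetector& funcD, const utility::tuple::Tuple<Params...>& t) // assumed signature
    : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}
};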
/// template deduction for \ref ExprConstructor struct /// template deduction for \ref ExprConstructor struct
template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params> template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>

@ -147,7 +147,7 @@ SYCLFORCEDEVALEXTACC(const)
SYCLFORCEDEVALEXTACC() SYCLFORCEDEVALEXTACC()
#undef SYCLFORCEDEVALEXTACC #undef SYCLFORCEDEVALEXTACC
//TensorCustomUnaryOp
#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\ #define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
template <typename CustomUnaryFunc, typename XprType, typename Dev >\ template <typename CustomUnaryFunc, typename XprType, typename Dev >\
struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\ struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
@ -160,7 +160,7 @@ SYCLCUSTOMUNARYOPEXTACC(const)
SYCLCUSTOMUNARYOPEXTACC() SYCLCUSTOMUNARYOPEXTACC()
#undef SYCLCUSTOMUNARYOPEXTACC #undef SYCLCUSTOMUNARYOPEXTACC
//TensorCustomBinaryOp
#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\ #define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\ template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\ struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
@ -172,9 +172,6 @@ SYCLCUSTOMBINARYOPEXTACC(const)
SYCLCUSTOMBINARYOPEXTACC() SYCLCUSTOMBINARYOPEXTACC()
#undef SYCLCUSTOMBINARYOPEXTACC #undef SYCLCUSTOMBINARYOPEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
#define SYCLEVALTOEXTACC(CVQual)\ #define SYCLEVALTOEXTACC(CVQual)\
template <typename Expr, typename Dev>\ template <typename Expr, typename Dev>\
@ -188,15 +185,19 @@ SYCLEVALTOEXTACC()
#undef SYCLEVALTOEXTACC #undef SYCLEVALTOEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
#define SYCLREDUCTIONEXTACC(CVQual)\ #define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\
template <typename OP, typename Dim, typename Expr, typename Dev>\ template <typename OP, typename Dim, typename Expr, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\ struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
}; };
// TensorReductionOp
SYCLREDUCTIONEXTACC(const,TensorReductionOp)
SYCLREDUCTIONEXTACC(,TensorReductionOp)
SYCLREDUCTIONEXTACC(const) // TensorTupleReducerOp
SYCLREDUCTIONEXTACC() SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp)
SYCLREDUCTIONEXTACC(,TensorTupleReducerOp)
#undef SYCLREDUCTIONEXTACC #undef SYCLREDUCTIONEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp /// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
@ -206,14 +207,14 @@ template<typename Indices, typename LhsXprType, typename RhsXprType, typename De
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
}; };
//TensorContractionOp
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
//TensorConvolutionOp
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
#undef SYCLCONTRACTIONCONVOLUTIONEXTACC #undef SYCLCONTRACTIONCONVOLUTIONEXTACC
/// specialisation of the \ref ExtractAccessor struct when the node type is /// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorSlicingOp. /// const TensorSlicingOp.
#define SYCLSLICEOPEXTACC(CVQual)\ #define SYCLSLICEOPEXTACC(CVQual)\
@ -252,7 +253,6 @@ SYCLTENSORCHIPPINGOPEXTACC(const)
SYCLTENSORCHIPPINGOPEXTACC() SYCLTENSORCHIPPINGOPEXTACC()
#undef SYCLTENSORCHIPPINGOPEXTACC #undef SYCLTENSORCHIPPINGOPEXTACC
// specialisation of the \ref ExtractAccessor struct when the node type is // specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorImagePatchOp. /// TensorImagePatchOp.
#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\ #define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\
@ -266,8 +266,6 @@ SYCLTENSORIMAGEPATCHOPEXTACC(const)
SYCLTENSORIMAGEPATCHOPEXTACC() SYCLTENSORIMAGEPATCHOPEXTACC()
#undef SYCLTENSORIMAGEPATCHOPEXTACC #undef SYCLTENSORIMAGEPATCHOPEXTACC
// specialisation of the \ref ExtractAccessor struct when the node type is // specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorVolumePatchOp. /// TensorVolumePatchOp.
#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\ #define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\
@ -281,21 +279,23 @@ SYCLTENSORVOLUMEPATCHOPEXTACC(const)
SYCLTENSORVOLUMEPATCHOPEXTACC() SYCLTENSORVOLUMEPATCHOPEXTACC()
#undef SYCLTENSORVOLUMEPATCHOPEXTACC #undef SYCLTENSORVOLUMEPATCHOPEXTACC
// specialisation of the \ref ExtractAccessor struct when the node type is // specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorLayoutSwapOp. /// TensorLayoutSwapOp, TensorIndexTupleOp
#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ #define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\
template<typename XprType, typename Dev>\ template<typename XprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorLayoutSwapOp<XprType>, Dev> >{\ struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorLayoutSwapOp<XprType>, Dev>& eval)\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
}; };
SYCLTENSORLAYOUTSWAPOPEXTACC(const) // TensorLayoutSwapOp
SYCLTENSORLAYOUTSWAPOPEXTACC() SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp)
#undef SYCLTENSORLAYOUTSWAPOPEXTACC SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp)
//TensorIndexTupleOp
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp)
SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp)
#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC
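A minimal sketch of what the generalised accessor macro produces for the new node; a TensorIndexTupleOp owns no buffer of its own, so it simply forwards accessor extraction to its child evaluator:
// Sketch only: SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const, TensorIndexTupleOp) roughly expands to
template<typename XprType, typename Dev>
struct ExtractAccessor<TensorEvaluator<const TensorIndexTupleOp<XprType>, Dev> > {
  static inline auto getTuple(cl::sycl::handler& cgh,
                              const TensorEvaluator<const TensorIndexTupleOp<XprType>, Dev>& eval)
  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))
};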
/// template deduction for \ref ExtractAccessor /// template deduction for \ref ExtractAccessor
template <typename Evaluator> template <typename Evaluator>

@ -126,19 +126,19 @@ struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFu
typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\ typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
DEFALTACTION(Evaluator)\ DEFALTACTION(Evaluator)\
}; };
//TensorCustomUnaryOp
SYCLEXTRFUNCCUSTOMUNARYOP(const) SYCLEXTRFUNCCUSTOMUNARYOP(const)
SYCLEXTRFUNCCUSTOMUNARYOP() SYCLEXTRFUNCCUSTOMUNARYOP()
#undef SYCLEXTRFUNCCUSTOMUNARYOP #undef SYCLEXTRFUNCCUSTOMUNARYOP
//TensorCustomBinaryOp
#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\ #define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\
template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\ template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\ struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\ typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
DEFALTACTION(Evaluator)\ DEFALTACTION(Evaluator)\
}; };
//TensorCustomBinaryOp
SYCLEXTRFUNCCUSTOMBIBARYOP(const) SYCLEXTRFUNCCUSTOMBIBARYOP(const)
SYCLEXTRFUNCCUSTOMBIBARYOP() SYCLEXTRFUNCCUSTOMBIBARYOP()
#undef SYCLEXTRFUNCCUSTOMBIBARYOP #undef SYCLEXTRFUNCCUSTOMBIBARYOP
@ -177,7 +177,7 @@ SYCLEXTRFUNCASSIGNOP()
/// specialisation of the \ref FunctorExtractor struct when the node types are /// specialisation of the \ref FunctorExtractor struct when the node types are
/// TensorEvalToOp, TensorLayoutSwapOp. This is a specialisation without OP so it has to be separated. /// TensorEvalToOp, TensorLayoutSwapOp. This is a specialisation without OP so it has to be separated.
#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(CVQual, ExprNode)\ #define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\
template <typename Expr, typename Dev>\ template <typename Expr, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\ FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\
@ -185,13 +185,16 @@ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
: xprExpr(expr.impl()) {}\ : xprExpr(expr.impl()) {}\
}; };
//TensorEvalToOp //TensorEvalToOp
SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorEvalToOp) SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp)
SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorEvalToOp) SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp)
// TensorLayoutSwapOp // TensorLayoutSwapOp
SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorLayoutSwapOp) SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp)
SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorLayoutSwapOp) SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp)
// TensorIndexTupleOp
SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp)
SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp)
#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUT #undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE
template<typename Dim, size_t NumOutputDim> struct DimConstr { template<typename Dim, size_t NumOutputDim> struct DimConstr {
template<typename InDim> template<typename InDim>
@ -202,10 +205,10 @@ template<typename Dim> struct DimConstr<Dim, 0> {
template<typename InDim> template<typename InDim>
static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));} static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
}; };
//TensorReductionOp
#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\ #define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\ template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\ struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\
typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\ typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\ typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
const Dimensions m_dimensions;\ const Dimensions m_dimensions;\
@ -213,12 +216,39 @@ struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgTy
FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\ FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
: m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\ : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
}; };
SYCLEXTRFUNCREDUCTIONOP(const) SYCLEXTRFUNCREDUCTIONOP(const)
SYCLEXTRFUNCREDUCTIONOP() SYCLEXTRFUNCREDUCTIONOP()
#undef SYCLEXTRFUNCREDUCTIONOP #undef SYCLEXTRFUNCREDUCTIONOP
//TensorTupleReducerOp
#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\
template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\
typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\
static const int NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\
typedef typename Evaluator::StrideDims StrideDims;\
typedef typename Evaluator::Index Index;\
typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
const Dimensions m_dimensions;\
const int m_return_dim;\
const StrideDims m_strides;\
const Index m_stride_mod;\
const Index m_stride_div;\
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;}\
EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}\
EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}\
EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}\
FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\
: m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\
m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\
};
SYCLEXTRFUNCTUPLEREDUCTIONOP(const)
SYCLEXTRFUNCTUPLEREDUCTIONOP()
#undef SYCLEXTRFUNCTUPLEREDUCTIONOP
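The new extractor snapshots everything the device kernel needs to rebuild an argmax/argmin result (output dimensions, return_dim and the stride bookkeeping) as trivially copyable members. A rough host-side sketch, assuming a SyclDevice named dev and a float TensorMap named t; the variable names are illustrative only:
const auto argmax_expr = t.argmax(1);                              // TensorTupleReducerOp expression
Eigen::TensorEvaluator<decltype(argmax_expr), Eigen::SyclDevice> eval(argmax_expr, dev);
auto func = Eigen::TensorSycl::internal::extractFunctors(eval);    // picks the specialisation above
// func.return_dim(), func.strides(), func.stride_mod() and func.stride_div() are plain copies of
// the evaluator's stride state, so they can be captured by value inside the SYCL kernel functor.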
//TensorContractionOp and TensorConvolutionOp
#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ #define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\ template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
@ -230,9 +260,10 @@ struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, Rhs
: m_dimensions(expr.dimensions()) {}\ : m_dimensions(expr.dimensions()) {}\
}; };
//TensorContractionOp
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp) SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp) SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
//TensorConvolutionOp
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp) SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp) SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP #undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
@ -255,6 +286,7 @@ SYCLEXTRFUNCTSLICEOP(const)
SYCLEXTRFUNCTSLICEOP() SYCLEXTRFUNCTSLICEOP()
#undef SYCLEXTRFUNCTSLICEOP #undef SYCLEXTRFUNCTSLICEOP
//TensorStridingSlicingOp
#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\ #define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\ template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\ struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
@ -273,7 +305,7 @@ SYCLEXTRFUNCTSLICESTRIDEOP(const)
SYCLEXTRFUNCTSLICESTRIDEOP() SYCLEXTRFUNCTSLICESTRIDEOP()
#undef SYCLEXTRFUNCTSLICESTRIDEOP #undef SYCLEXTRFUNCTSLICESTRIDEOP
// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory // Had to separate TensorReshapingOp and TensorShufflingOp; otherwise they would be mistaken for UnaryCategory
#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\ #define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
template<typename Param, typename XprType, typename Dev>\ template<typename Param, typename XprType, typename Dev>\
struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\ struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
@ -284,9 +316,11 @@ struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprTy
: xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\ : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
}; };
//TensorReshapingOp
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const) SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), ) SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
//TensorShufflingOp
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const) SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), ) SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT #undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
@ -343,6 +377,7 @@ SYCLEXTRFUNCCHIPPINGOP(const)
SYCLEXTRFUNCCHIPPINGOP() SYCLEXTRFUNCCHIPPINGOP()
#undef SYCLEXTRFUNCCHIPPINGOP #undef SYCLEXTRFUNCCHIPPINGOP
//TensorImagePatchOp
#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\ #define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\
template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\ template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\ struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\
@ -420,7 +455,6 @@ SYCLEXTRFUNCVOLUMEPATCHOP()
#undef SYCLEXTRFUNCVOLUMEPATCHOP #undef SYCLEXTRFUNCVOLUMEPATCHOP
/// template deduction function for FunctorExtractor /// template deduction function for FunctorExtractor
template <typename Evaluator> template <typename Evaluator>
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> { auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {

@ -72,7 +72,7 @@ namespace internal {
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
public: public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
void operator()(cl::sycl::nd_item<1> itemID) { void operator()(cl::sycl::nd_item<1> itemID) {
@ -85,8 +85,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device. /// the device_evaluator is detectable and recognisable on the device.
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const cast added as a naive solution to solve the qualifier drop error /// const cast added as a naive solution to solve the qualifier drop error
auto globalid=static_cast<Index>(itemID.get_global_linear_id()); auto globalid=static_cast<Index>(itemID.get_global_linear_id());
@ -111,7 +111,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> { class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
public: public:
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op; typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_) Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
@ -126,8 +126,8 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device. /// the device_evaluator is detectable and recognisable on the device.
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
/// const cast added as a naive solution to solve the qualifier drop error /// const cast added as a naive solution to solve the qualifier drop error
auto globalid=static_cast<Index>(itemID.get_global_linear_id()); auto globalid=static_cast<Index>(itemID.get_global_linear_id());
@ -173,7 +173,7 @@ public:
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device. /// the device_evaluator is detectable and recognisable on the device.
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const cast added as a naive solution to solve the qualifier drop error /// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id(); auto globalid=itemID.get_global_linear_id();
@ -220,7 +220,7 @@ public:
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
/// the device_evaluator is detectable and recognisable on the device. /// the device_evaluator is detectable and recognisable on the device.
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
/// const cast added as a naive solution to solve the qualifier drop error /// const cast added as a naive solution to solve the qualifier drop error
auto globalid=itemID.get_global_linear_id(); auto globalid=itemID.get_global_linear_id();
auto scale = (rng*red_factor) + remaining; auto scale = (rng*red_factor) + remaining;
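In line with the rest of this change, the reduction kernels now request write access instead of discard_write. A short sketch of the difference; discard_write permits the runtime to drop a buffer's previous contents, which is only safe when the kernel overwrites every element of the output:
// current accessor type used by the kernels above
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
                           cl::sycl::access::target::global_buffer> write_accessor;
// previous accessor type; with partial writes the untouched elements could come back undefined
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write,
                           cl::sycl::access::target::global_buffer> old_write_accessor;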

@ -114,27 +114,37 @@ SYCLCUSTOMBINARYOPLEAFCOUNT()
#undef SYCLCUSTOMBINARYOPLEAFCOUNT #undef SYCLCUSTOMBINARYOPLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\ #define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\
template <typename Expr>\ template <typename Expr>\
struct LeafCount<CVQual ExprNode<Expr> > {\ struct LeafCount<CVQual ExprNode<Expr> > {\
static const size_t Count = Num + CategoryCount<Expr>::Count;\ static const size_t Count = Num + CategoryCount<Expr>::Count;\
}; };
EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorEvalToOp, 1) EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1)
EVALTOLAYOUTSWAPLEAFCOUNT(, TensorEvalToOp, 1) EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1)
EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorLayoutSwapOp, 0) EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0)
EVALTOLAYOUTSWAPLEAFCOUNT(, TensorLayoutSwapOp, 0) EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0)
#undef EVALTOLAYOUTSWAPLEAFCOUNT
EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0)
EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0)
#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
#define REDUCTIONLEAFCOUNT(CVQual)\ #define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\
template <typename OP, typename Dim, typename Expr>\ template <typename OP, typename Dim, typename Expr>\
struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\ struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\
static const size_t Count =1;\ static const size_t Count =1;\
}; };
REDUCTIONLEAFCOUNT(const) // TensorReductionOp
REDUCTIONLEAFCOUNT() REDUCTIONLEAFCOUNT(const,TensorReductionOp)
REDUCTIONLEAFCOUNT(,TensorReductionOp)
// tensor Argmax - TensorTupleReducerOp
REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp)
REDUCTIONLEAFCOUNT(, TensorTupleReducerOp)
#undef REDUCTIONLEAFCOUNT #undef REDUCTIONLEAFCOUNT
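Like a full TensorReductionOp, the argmax/argmin node is counted as a single leaf because the whole sub-expression is evaluated into its own device buffer. A compile-time sketch; the reducer, dimension and argument types below are arbitrary stand-ins and assume EIGEN_USE_SYCL with the unsupported Tensor module included:
typedef Eigen::TensorMap<Eigen::Tensor<float, 2> > Arg;                               // stand-in argument
typedef Eigen::array<Eigen::DenseIndex, 1> Dims;                                      // stand-in reduction dims
typedef Eigen::internal::ArgMaxTupleReducer<Eigen::Tuple<Eigen::DenseIndex, float> > Max;
static_assert(Eigen::TensorSycl::internal::LeafCount<
                  const Eigen::TensorTupleReducerOp<Max, Dims, Arg> >::Count == 1,
              "argmax/argmin node contributes exactly one leaf");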
/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp /// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
@ -150,8 +160,6 @@ CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
#undef CONTRACTIONCONVOLUTIONLEAFCOUNT #undef CONTRACTIONCONVOLUTIONLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp /// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
#define SLICEOPLEAFCOUNT(CVQual)\ #define SLICEOPLEAFCOUNT(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType>\ template <typename StartIndices, typename Sizes, typename XprType>\
@ -161,7 +169,6 @@ SLICEOPLEAFCOUNT(const)
SLICEOPLEAFCOUNT() SLICEOPLEAFCOUNT()
#undef SLICEOPLEAFCOUNT #undef SLICEOPLEAFCOUNT
/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp /// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
#define CHIPPINGOPLEAFCOUNT(CVQual)\ #define CHIPPINGOPLEAFCOUNT(CVQual)\
template <DenseIndex DimId, typename XprType>\ template <DenseIndex DimId, typename XprType>\
@ -195,7 +202,6 @@ TENSORIMAGEPATCHOPLEAFCOUNT()
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\ template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{}; struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{};
TENSORVOLUMEPATCHOPLEAFCOUNT(const) TENSORVOLUMEPATCHOPLEAFCOUNT(const)
TENSORVOLUMEPATCHOPLEAFCOUNT() TENSORVOLUMEPATCHOPLEAFCOUNT()
#undef TENSORVOLUMEPATCHOPLEAFCOUNT #undef TENSORVOLUMEPATCHOPLEAFCOUNT

@ -171,19 +171,24 @@ CUSTOMBINARYOPEVAL()
/// specialisation of the \ref PlaceHolderExpression when the node is /// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorEvalToOp, TensorLayoutSwapOp /// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp
#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\ #define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\
template <typename Expr, size_t N>\ template <typename Expr, size_t N>\
struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\ struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\
typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\ typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\
}; };
EVALTOLAYOUTSWAP(const, TensorEvalToOp) // TensorEvalToOp
EVALTOLAYOUTSWAP(, TensorEvalToOp) EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp)
EVALTOLAYOUTSWAP(const, TensorLayoutSwapOp) EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp)
EVALTOLAYOUTSWAP(, TensorLayoutSwapOp) //TensorLayoutSwapOp
EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp)
EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp)
//TensorIndexTupleOp
EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp)
EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp)
#undef EVALTOLAYOUTSWAP #undef EVALTOLAYOUTSWAPINDEXTUPLE
/// specialisation of the \ref PlaceHolderExpression when the node is /// specialisation of the \ref PlaceHolderExpression when the node is
@ -199,17 +204,24 @@ CHIPPINGOP()
#undef CHIPPINGOP #undef CHIPPINGOP
/// specialisation of the \ref PlaceHolderExpression when the node is /// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorReductionOp /// TensorReductionOp and TensorTupleReducerOp (Argmax)
#define SYCLREDUCTION(CVQual)\ #define SYCLREDUCTION(CVQual, ExprNode)\
template <typename OP, typename Dims, typename Expr, size_t N>\ template <typename OP, typename Dims, typename Expr, size_t N>\
struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\ struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\
typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\ typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\
}; };
SYCLREDUCTION(const)
SYCLREDUCTION() // tensor reduction
SYCLREDUCTION(const, TensorReductionOp)
SYCLREDUCTION(, TensorReductionOp)
// tensor Argmax - TensorTupleReducerOp
SYCLREDUCTION(const, TensorTupleReducerOp)
SYCLREDUCTION(, TensorTupleReducerOp)
#undef SYCLREDUCTION #undef SYCLREDUCTION
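For reference, a rough expansion of the new instantiation; on the device the whole argmax sub-expression is replaced by a numbered PlaceHolder leaf, exactly as for TensorReductionOp:
// Sketch only: SYCLREDUCTION(const, TensorTupleReducerOp) roughly expands to
template <typename OP, typename Dims, typename Expr, size_t N>
struct PlaceHolderExpression<const TensorTupleReducerOp<OP, Dims, Expr>, N> {
  typedef const PlaceHolder<const TensorTupleReducerOp<OP, Dims, Expr>, N> Type;
};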
/// specialisation of the \ref PlaceHolderExpression when the node is /// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorContractionOp and TensorConvolutionOp /// TensorContractionOp and TensorConvolutionOp
#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ #define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\

@ -25,7 +25,6 @@
namespace Eigen { namespace Eigen {
namespace TensorSycl { namespace TensorSycl {
template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr; typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
@ -38,7 +37,7 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
void operator()(cl::sycl::nd_item<1> itemID) { void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id()); typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
if (gId < range) if (gId < range)
device_evaluator.evalScalar(gId); device_evaluator.evalScalar(gId);

@ -173,6 +173,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_argmax_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL) endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for # It should be safe to always run these tests as there is some fallback code for

@ -0,0 +1,248 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_argmax_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
template <typename DataType, int Layout, typename DenseIndex>
static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){
Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2,2,2}});
Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
in.setRandom();
in *= in.constant(100.0);
in(0, 0, 0) = -1000.0;
in(1, 1, 1) = 1000.0;
std::size_t in_bytes = in.size() * sizeof(DataType);
std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 3>{{2,2,2}});
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max);
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min);
sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes);
gpu_out_max.device(sycl_device) = gpu_in.argmax();
gpu_out_min.device(sycl_device) = gpu_in.argmin();
sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes);
sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes);
VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1);
VERIFY_IS_EQUAL(out_min(), 0);
sycl_device.deallocate(d_in);
sycl_device.deallocate(d_out_max);
sycl_device.deallocate(d_out_min);
}
template <typename DataType, int DataLayout, typename DenseIndex>
static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device)
{
DenseIndex sizeDim0=9;
DenseIndex sizeDim1=3;
DenseIndex sizeDim2=5;
DenseIndex sizeDim3=7;
Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3);
std::vector<DenseIndex> dims;
dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3);
for (DenseIndex dim = 0; dim < 4; ++dim) {
array<DenseIndex, 3> out_shape;
for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
array<DenseIndex, 4> ix;
for (DenseIndex i = 0; i < sizeDim0; ++i) {
for (DenseIndex j = 0; j < sizeDim1; ++j) {
for (DenseIndex k = 0; k < sizeDim2; ++k) {
for (DenseIndex l = 0; l < sizeDim3; ++l) {
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
tensor(ix)=(ix[dim] != 0)?-1.0:10.0;
}
}
}
}
std::size_t in_bytes = tensor.size() * sizeof(DataType);
std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}});
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
gpu_out.device(sycl_device) = gpu_in.argmax(dim);
sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim)));
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the first index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
}
sycl_device.synchronize();
for (DenseIndex i = 0; i < sizeDim0; ++i) {
for (DenseIndex j = 0; j < sizeDim1; ++j) {
for (DenseIndex k = 0; k < sizeDim2; ++k) {
for (DenseIndex l = 0; l < sizeDim3; ++l) {
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0;
}
}
}
}
sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
gpu_out.device(sycl_device) = gpu_in.argmax(dim);
sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
sycl_device.deallocate(d_in);
sycl_device.deallocate(d_out);
}
}
template <typename DataType, int DataLayout, typename DenseIndex>
static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device)
{
DenseIndex sizeDim0=9;
DenseIndex sizeDim1=3;
DenseIndex sizeDim2=5;
DenseIndex sizeDim3=7;
Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3);
std::vector<DenseIndex> dims;
dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3);
for (DenseIndex dim = 0; dim < 4; ++dim) {
array<DenseIndex, 3> out_shape;
for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
array<DenseIndex, 4> ix;
for (DenseIndex i = 0; i < sizeDim0; ++i) {
for (DenseIndex j = 0; j < sizeDim1; ++j) {
for (DenseIndex k = 0; k < sizeDim2; ++k) {
for (DenseIndex l = 0; l < sizeDim3; ++l) {
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
tensor(ix)=(ix[dim] != 0)?1.0:-10.0;
}
}
}
}
std::size_t in_bytes = tensor.size() * sizeof(DataType);
std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
DataType * d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}});
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
gpu_out.device(sycl_device) = gpu_in.argmin(dim);
sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim)));
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect min to be in the first index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
}
sycl_device.synchronize();
for (DenseIndex i = 0; i < sizeDim0; ++i) {
for (DenseIndex j = 0; j < sizeDim1; ++j) {
for (DenseIndex k = 0; k < sizeDim2; ++k) {
for (DenseIndex l = 0; l < sizeDim3; ++l) {
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0;
}
}
}
}
sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
gpu_out.device(sycl_device) = gpu_in.argmin(dim);
sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect min to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
sycl_device.deallocate(d_in);
sycl_device.deallocate(d_out);
}
}
template<typename DataType, typename Device_Selector> void sycl_argmax_test_per_device(const Device_Selector& d){
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_argmax_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_argmax_test_per_device<double>(device));
}
}