From 35bae513a0094f986c810c3f839e5a954caabd4b Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 16 Dec 2016 19:46:45 +0000 Subject: [PATCH] Converting all parallel for lambda to functor in order to prevent kernel duplication name error; adding tensorConcatinationOp backend for sycl. --- .../CXX11/src/Tensor/TensorConcatenation.h | 6 + .../CXX11/src/Tensor/TensorContractionSycl.h | 312 ++++++++++-------- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 60 ++-- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 4 +- .../src/Tensor/TensorForwardDeclarations.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 5 +- .../CXX11/src/Tensor/TensorReductionSycl.h | 99 ++---- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 5 + .../src/Tensor/TensorSyclExtractFunctors.h | 26 +- .../CXX11/src/Tensor/TensorSyclFunctors.h | 86 ++++- .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 43 ++- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 6 + unsupported/test/CMakeLists.txt | 1 + .../test/cxx11_tensor_concatenation_sycl.cpp | 180 ++++++++++ 16 files changed, 588 insertions(+), 264 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_concatenation_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 59bf90d93..2c7ba961c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -276,6 +276,12 @@ struct TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } + /// required by sycl in order to extract the accessor + const Axis& axis() const { return m_axis; } protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index 0cc97c59d..b170a1a5c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -190,16 +190,168 @@ LeftEvaluator m_leftImpl; RightEvaluator m_rightImpl; }; -template struct KernelNameConstructor; + +template struct KernelConstructor{ + + typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; + + FunctorExpr functors; + LhsLocalAcc localLhs; + RhsLocalAcc localRhs; + OutAccessor out_res; + Index roundUpK, M, N, K; + ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; + LeftNocontractT m_i_strides, m_left_nocontract_strides; + RightNocontractT m_j_strides, m_right_nocontract_strides; + TupleType tuple_of_accessors; + + KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, + ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, + LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_) + :functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), + m_right_contracting_strides(m_right_contracting_strides_), + 
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), + m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), + tuple_of_accessors(tuple_of_accessors_){} + + void operator()(cl::sycl::nd_item<1> itemID) { + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); + auto device_evaluator = TensorEvaluatorContainer(device_expr.expr, Eigen::DefaultDevice()); + typedef TensorEvaluatorContainer DevEvaluator; + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + // initialize data mappers must happen inside the kernel for device eval + LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); + RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); + auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); + // Matmul Kernel + // Thread identifiers + const int mLocalThreadId = itemID.get_local(0); // Local ID row + const int nLocalThreadId = itemID.get_local(1); // Local ID col + const int mGroupId = itemID.get_group(0); // Work-group ID row + const int nGroupId = itemID.get_group(1); // Work-group ID localCol + const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID + // Allocate register space + float privateLhs; + float privateRhs[WorkLoadPerThreadN]; + float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; + // Initialise the privateResumulation registers + for (int wLPTM=0; wLPTM(0); + } + // Tile Rhs + for (int lPTR=0; lPTR(0); + + } + // Loop over all tiles + const int numTiles = roundUpK/TileSizeDimK; + int firstHalf=0; + do { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // Load the next tile of Lhs and Rhs into local memory + int nextHalf = firstHalf + 1; + if (nextHalf < numTiles) { + // Tile A + for (int lPTL=0; lPTL(0); + } + // Tile B + for (int lPTR=0; lPTR(0); + } + } + // Loop over the values of a single tile + for (int k=0; k struct LaunchSyclKernels { static const int TileSizeDimM = 32; // Tile size for dimension M static const int TileSizeDimN = 32; // Tile size for dimension N -static const int TileSizeDimK = 16; // Tile size for dimension K +static const int TileSizeDimK = 16; // Tile size for dimension K static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N -static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) +static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) +static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression static const int LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression @@ -208,149 +360,39 @@ static int 
RoundUp(int x, int y) { return ((((x) + (y) - 1) / (y))*(y)); } -template< typename Self, typename Output, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT> - static void Run(const Self& self, Output* buffer, Index M, Index N, Index K, +template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT> + static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ // create a tuple of accessors from Evaluator - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; - typedef KernelNameConstructor KernelName; + typedef typename Self::XprType HostExpr; + // typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; + // typedef KernelNameConstructor KernelName; auto functors = Eigen::TensorSycl::internal::extractFunctors(self); + typedef decltype(functors) FunctorExpr; Index roundUpK = RoundUp(K, TileSizeDimK); Index roundUpM = RoundUp(M, TileSizeDimM); Index roundUpN = RoundUp(N, TileSizeDimN); self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self); + typedef decltype(tuple_of_accessors) TupleType; // Local memory for elements of Lhs - cl::sycl::accessor localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); + typedef cl::sycl::accessor LhsLocalAcc; + LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); // Local memory for elements of Rhs - cl::sycl::accessor localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - //Output memory - auto out_privateRes= self.device(). template get_sycl_accessor(cgh, buffer); + typedef cl::sycl::accessor RhsLocalAcc; + RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); + //OutScalar memory + auto out_res= self.device(). 
template get_sycl_accessor(cgh, buffer); + typedef decltype(out_res) OutAccessor; // sycl parallel for - cgh.parallel_for( cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), [=](cl::sycl::nd_item<2> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = TensorEvaluatorContainer(device_expr.expr, Eigen::DefaultDevice()); - typedef TensorEvaluatorContainer DevEvaluator; - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); - auto out_ptr = ConvertToActualTypeSycl(Output, out_privateRes); - // Matmul Kernel - // Thread identifiers - const int mLocalThreadId = itemID.get_local(0); // Local ID row - const int nLocalThreadId = itemID.get_local(1); // Local ID col - const int mGroupId = itemID.get_group(0); // Work-group ID row - const int nGroupId = itemID.get_group(1); // Work-group ID localCol - const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID - // Allocate register space - float privateLhs; - float privateRhs[WorkLoadPerThreadN]; - float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; - // Initialise the privateResumulation registers - for (int wLPTM=0; wLPTM(0); - } - // Tile Rhs - for (int lPTR=0; lPTR(0); - - } - // Loop over all tiles - const int numTiles = roundUpK/TileSizeDimK; - int firstHalf=0; - do { - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // Load the next tile of Lhs and Rhs into local memory - int nextHalf = firstHalf + 1; - if (nextHalf < numTiles) { - // Tile A - for (int lPTL=0; lPTL(0); - } - // Tile B - for (int lPTR=0; lPTR(0); - } - } - // Loop over the values of a single tile - for (int k=0; k(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), + cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), + KernelConstructor(functors, + localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, + m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors)); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 96c95e294..d444f3cd8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -43,6 +43,18 @@ namespace Eigen { size_t m_offset; }; + struct memsetkernelFunctor{ + typedef cl::sycl::accessor AccType; + AccType m_acc; + const size_t m_rng, m_c; + memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} + void operator()(cl::sycl::nd_item<1> itemID) { + auto globalid=itemID.get_global_linear_id(); + if (globalid< m_rng) m_acc[globalid] = m_c; + } + + }; + EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto 
devices = cl::sycl::device::get_devices(); std::vector::iterator it =devices.begin(); @@ -88,15 +100,17 @@ struct QueueInterface { } } })) - #else - m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - if (e) { - exception_caught_ = true; - } - } - })) - #endif +#else +m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { + for (const auto& e : l) { + if (e) { + exception_caught_ = true; + std::cerr << "Error detected Inside Sycl Device."<< std::endl; + + } + } +})) +#endif {} /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. @@ -256,22 +270,26 @@ struct SyclDevice { /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} /// Here is the implementation of memset function on sycl. - template EIGEN_STRONG_INLINE void memset(T *data, int c, size_t n) const { + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto buf_acc =get_sycl_buffer(static_cast(static_cast(data))). template get_access(cgh); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< rng) { - for(size_t i=0; i(static_cast(data))),rng, GRange, tileSize, c )); asynchronousExec(); } + struct memsetCghFunctor{ + cl::sycl::buffer& m_buf; + const size_t& rng , GRange, tileSize; + const int &c; + memsetCghFunctor(cl::sycl::buffer& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + + void operator()(cl::sycl::handler &cgh) const { + auto buf_acc = m_buf.template get_access(cgh); + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); + } + }; + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME return 48*1024; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 06987132b..82dd1e640 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -41,6 +41,9 @@ struct traits > // Intermediate typedef to workaround MSVC issue. 
typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; + typedef typename MakePointerT::RefType RefType; + + }; }; @@ -117,7 +120,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { return m_op; } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index a68010c55..d6415817b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -69,7 +69,9 @@ struct TensorEvaluator return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::traits::template MakePointer::RefType + coeffRef(Index index) { eigen_assert(m_data); return m_data[index]; } @@ -95,7 +97,9 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::traits::template MakePointer::RefType + coeffRef(const array& coords) { eigen_assert(m_data); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index bbd5eb374..930837021 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -46,6 +46,8 @@ struct traits > // Intermediate typedef to workaround MSVC issue. typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; + typedef typename MakePointerT::RefType RefType; + }; }; @@ -107,7 +109,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl + /// op_ is used for sycl : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 499582a4c..e6aa0f334 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -20,16 +20,20 @@ namespace Eigen { // map_allocator. 
template struct MakePointer { typedef T* Type; + typedef T& RefType; }; #if defined(EIGEN_USE_SYCL) namespace TensorSycl { namespace internal{ -template class ReductionFunctor; +template class ReductionFunctor; +template +struct FullReductionKernelFunctor; } } #endif + template class MakePointer_ = MakePointer> class TensorMap; template class Tensor; template class TensorFixedSize; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 75518a854..c841786b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -693,10 +693,11 @@ struct TensorEvaluator, #endif #if defined(EIGEN_USE_SYCL) - template < typename HostExpr_, typename PlaceHolderExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor; - + template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor; + template friend class TensorSycl::internal::FullReductionKernelFunctor; #endif + template friend struct internal::InnerReducer; // Returns the Index in the input tensor of the first value that needs to be diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index d5bc7b71b..c9912d9d4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -25,8 +25,7 @@ namespace Eigen { namespace internal { - -template struct syclGenericBufferReducer{ +template struct syclGenericBufferReducer{ template static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { @@ -35,50 +34,16 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, * and a second to local memory, used to store intermediate data. */ - auto aI = - bufI.template get_access(h); - auto aOut = - bufOut.template get_access(h); - cl::sycl::accessor - scratch(cl::sycl::range<1>(local), h); + auto aI =bufI.template get_access(h); + auto aOut =bufOut.template get_access(h); + typedef decltype(aI) InputAccessor; + typedef decltype(aOut) OutputAccessor; + typedef cl::sycl::accessor LocalAccessor; + LocalAccessor scratch(cl::sycl::range<1>(local), h); /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. */ - h.parallel_for( - r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) { - size_t globalid = id.get_global(0); - size_t localid = id.get_local(0); - /* All threads collectively read from global memory into local. - * The barrier ensures all threads' IO is resolved before - * execution continues (strictly speaking, all threads within - * a single work-group - there is no co-ordination between - * work-groups, only work-items). */ - if (globalid < length) { - scratch[localid] = aI[globalid]; - } - id.barrier(cl::sycl::access::fence_space::local_space); - - /* Apply the reduction operation between the current local - * id and the one on the other half of the vector. */ - if (globalid < length) { - auto min = (length < local) ? 
length : local; - for (size_t offset = min / 2; offset > 0; offset /= 2) { - if (localid < offset) { - scratch[localid] += scratch[localid + offset]; - } - id.barrier(cl::sycl::access::fence_space::local_space); - } - /* The final result will be stored in local id 0. */ - if (localid == 0) { - aI[id.get_group(0)] = scratch[localid]; - if((length<=local) && globalid ==0){ - auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); - aOutPtr[0]=scratch[0]; - } - } - } - }); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -96,11 +61,11 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de }; -/// For now let's start with a full reducer /// Self is useless here because in expression construction we are going to treat reduction as a leafnode. /// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the /// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as // a leafNode. + template struct FullReducer { @@ -109,8 +74,8 @@ struct FullReducer { static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; auto functors = TensorSycl::internal::extractFunctors(self.impl()); + typedef decltype(functors) FunctorExpr; int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. size_t inputSize =self.impl().dimensions().TotalSize(); size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input @@ -135,48 +100,29 @@ struct FullReducer { size_t outTileSize = tileSize; /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. if (GRange < outTileSize) outTileSize=GRange; - // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.get_sycl_buffer(output); - /// creating the shared memory for calculating reduction. /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can /// recursively apply reduction on it in order to reduce the whole. auto temp_global_buffer =cl::sycl::buffer(cl::sycl::range<1>(GRange)); typedef typename Eigen::internal::remove_all::type Dims; - Dims dims= self.xprDims(); - Op functor = reducer; + // Dims dims= self.xprDims(); + //Op functor = reducer; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); + typedef decltype(tuple_of_accessors) TupleType; auto tmp_global_accessor = temp_global_buffer. 
template get_access(cgh); - - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour - /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the - /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. - const auto device_self_expr= TensorReductionOp(device_expr.expr, dims, functor); - /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is - /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); - /// const cast added as a naive solution to solve the qualifier drop error - auto globalid=itemID.get_global_linear_id(); - - if(globalid::reduce(device_self_evaluator, static_cast(red_factor*globalid), red_factor, const_cast(functor)); - else - tmp_global_accessor.get_pointer()[globalid]=static_cast(0); - - if(remaining!=0 && globalid==0 ) - // this will add the rest of input buffer when the input size is not devidable to red_factor. - tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer::reduce(device_self_evaluator, static_cast(red_factor*(rng)), static_cast(remaining), const_cast(functor)); - }); + typedef decltype(tmp_global_accessor) OutAccessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), + TensorSycl::internal::FullReductionKernelFunctor + (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors)); }); dev.asynchronousExec(); -/// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); + // getting final out buffer at the moment the created buffer is true because there is no need for assign + auto out_buffer =dev.get_sycl_buffer(output); + /// This is used to recursively reduce the tmp value to an element of 1; + syclGenericBufferReducer::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -190,7 +136,6 @@ struct InnerReducer { static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; auto functors = TensorSycl::internal::extractFunctors(self.impl()); typedef decltype(functors) FunctorExpr; typename Self::Index range, GRange, tileSize; @@ -208,7 +153,7 @@ struct InnerReducer { auto output_accessor = dev.template get_sycl_accessor(cgh, output); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - TensorSycl::internal::ReductionFunctor + TensorSycl::internal::ReductionFunctor (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range)); }); diff 
--git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index d7cbb420f..2e61ee049 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -20,12 +20,14 @@ template struct MakeGlobalPointer { typedef typename cl::sycl::global_ptr::pointer_t Type; + typedef typename cl::sycl::global_ptr::reference_t RefType; }; // global pointer to set different attribute state for a class template struct MakeLocalPointer { typedef typename cl::sycl::local_ptr::pointer_t Type; + typedef typename cl::sycl::local_ptr::reference_t RefType; }; @@ -33,6 +35,9 @@ namespace Eigen { namespace TensorSycl { namespace internal { + template struct GenericKernelReducer; + + /// This struct is used for special expression nodes with no operations (for example assign and selectOP). struct NoOP; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 4376a0e3c..6f9ab57af 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -241,21 +241,25 @@ PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const) PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), ) #undef PADDINGOPFUNCEXT -/// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorContractionOp The LHS and RHS here are the original one no need to apply condition on their type. -#define SYCLEXTRFUNCCONTRACT(CVQual)\ -template \ -struct FunctorExtractor, Dev> > {\ +/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp and TensorConcatenationOp +/// for TensorContractionOp the LHS and RHS here are the original one no need to apply condition on their type. 
+#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\ +template \ +struct FunctorExtractor, Dev> > {\ FunctorExtractor > lhsExpr;\ FunctorExtractor > rhsExpr;\ - const Indices func;\ - FunctorExtractor(const TensorEvaluator, Dev>& expr)\ - : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.indices()) {}\ + const Param func;\ + FunctorExtractor(const TensorEvaluator, Dev>& expr)\ + : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\ }; -SYCLEXTRFUNCCONTRACT(const) -SYCLEXTRFUNCCONTRACT() -#undef SYCLEXTRFUNCCONTRACT +// TensorContractionOp +SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const) +SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),) +// TensorConcatenationOp +SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) +SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) +#undef SYCLEXTRFUNCCONTRACTCONCAT /// template deduction function for FunctorExtractor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 56488d5d7..85c280588 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -18,9 +18,53 @@ namespace Eigen { namespace TensorSycl { namespace internal { + template struct GenericKernelReducer{ + OutputAccessor aOut; + InputAccessor aI; + LocalAccessor scratch; + size_t length, local; + GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + void operator()(cl::sycl::nd_item<1> itemID) { + size_t globalid = itemID.get_global(0); + size_t localid = itemID.get_local(0); + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + if (globalid < length) { + scratch[localid] = aI[globalid]; + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. */ + if (globalid < length) { + auto min = (length < local) ? length : local; + for (size_t offset = min / 2; offset > 0; offset /= 2) { + if (localid < offset) { + scratch[localid] += scratch[localid + offset]; + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + /* The final result will be stored in local id 0. 
*/ + if (localid == 0) { + aI[itemID.get_group(0)] = scratch[localid]; + if((length<=local) && globalid ==0){ + auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); + aOutPtr[0]=scratch[0]; + } + } + } + } + + }; + /// ReductionFunctor -template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { +template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { public: + typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; typedef cl::sycl::accessor write_accessor; ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_) :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} @@ -56,6 +100,46 @@ template < typename HostExpr, typename PlaceHolderExpr, typename FunctorExpr, ty }; +template +struct FullReductionKernelFunctor{ + typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; + OutAccessor tmp_global_accessor; + Index rng , remaining, red_factor; + Op op; + Dims dims; + FunctorExpr functors; + TupleType tuple_of_accessors; + + FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc) + :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){} + + void operator()(cl::sycl::nd_item<1> itemID) { + + typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, op); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=itemID.get_global_linear_id(); + + if(globalid::reduce(device_self_evaluator, static_cast(red_factor*globalid), red_factor, const_cast(op)); + else + tmp_global_accessor.get_pointer()[globalid]=static_cast(0); + + if(remaining!=0 && globalid==0 ) + // this will add the rest of input buffer when the input size is not devidable to red_factor. 
+ tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer::reduce(device_self_evaluator, static_cast(red_factor*(rng)), static_cast(remaining), const_cast(op)); + } +}; + + + } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index c941abf5c..5862c9795 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -25,6 +25,31 @@ namespace Eigen { namespace TensorSycl { + + + template struct ExecExprFunctorKernel{ + typedef typename internal::createPlaceHolderExpression::Type PlaceHolderExpr; + + typedef typename Expr::Index Index; + Index range; + FunctorExpr functors; + TupleType tuple_of_accessors; + ExecExprFunctorKernel(Index range_ + , + FunctorExpr functors_, TupleType tuple_of_accessors_ + ) + :range(range_) + , functors(functors_), tuple_of_accessors(tuple_of_accessors_) + {} + void operator()(cl::sycl::nd_item<1> itemID) { + typedef typename internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr =internal::createDeviceExpression(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + typename DevExpr::Index gId = static_cast(itemID.get_global_linear_id()); + if (gId < range) + device_evaluator.evalScalar(gId); + } + }; /// The run function in tensor sycl convert the expression tree to a buffer /// based expression tree; /// creates the expression tree for the device with accessor to buffers; @@ -34,25 +59,19 @@ void run(Expr &expr, Dev &dev) { Eigen::TensorEvaluator evaluator(expr, dev); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - typedef typename internal::createPlaceHolderExpression::Type PlaceHolderExpr; auto functors = internal::extractFunctors(evaluator); - + typedef decltype(functors) FunctorExpr; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator auto tuple_of_accessors = internal::createTupleOfAccessors(cgh, evaluator); + typedef decltype(tuple_of_accessors) TupleType; typename Expr::Index range, GRange, tileSize; dev.parallel_for_setup(static_cast(evaluator.dimensions().TotalSize()), tileSize, range, GRange); - // run the kernel - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { - typedef typename internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); - typename DevExpr::Index gId = static_cast(itemID.get_global_linear_id()); - if (gId < range) { - device_evaluator.evalScalar(gId); - } - }); + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), + ExecExprFunctorKernel(range + , functors, tuple_of_accessors + )); }); dev.asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index ffcf8b00f..a1e944e59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -58,6 +58,8 @@ struct traits > }; template struct MakePointer { typedef T* Type; + typedef T& RefType; + }; }; @@ -76,6 +78,8 @@ struct traits > }; template struct MakePointer { typedef T* Type; + typedef T& RefType; + }; }; @@ -98,6 +102,8 @@ struct traits 
> // Intermediate typedef to workaround MSVC issue. typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; + typedef typename MakePointerT::RefType RefType; + }; }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 0405ee9fa..daedb671c 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -151,6 +151,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp new file mode 100644 index 000000000..5a324b44c --- /dev/null +++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp @@ -0,0 +1,180 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; + +template +static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) +{ + Index leftDim1 = 2; + Index leftDim2 = 3; + Index leftDim3 = 1; + Eigen::array leftRange = {{leftDim1, leftDim2, leftDim3}}; + Index rightDim1 = 2; + Index rightDim2 = 3; + Index rightDim3 = 1; + Eigen::array rightRange = {{rightDim1, rightDim2, rightDim3}}; + + //Index concatDim1 = 3; +// Index concatDim2 = 3; +// Index concatDim3 = 1; + //Eigen::array concatRange = {{concatDim1, concatDim2, concatDim3}}; + + Tensor left(leftRange); + Tensor right(rightRange); + left.setRandom(); + right.setRandom(); + + DataType * gpu_in1_data = static_cast(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); + + Eigen::TensorMap> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap> gpu_in2(gpu_in2_data, rightRange); + sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); + /// + Tensor concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); + DataType * gpu_out_data1 = static_cast(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap> gpu_out1(gpu_out_data1, concatenation1.dimensions()); + + //concatenation = left.concatenate(right, 0); + gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0); + sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(concatenation1.dimension(0), 4); + VERIFY_IS_EQUAL(concatenation1.dimension(1), 3); + 
VERIFY_IS_EQUAL(concatenation1.dimension(2), 1); + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0)); + } + for (int i = 2; i < 4; ++i) { + VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0)); + } + } + + sycl_device.deallocate(gpu_out_data1); + Tensor concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); + DataType * gpu_out_data2 = static_cast(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap> gpu_out2(gpu_out_data2, concatenation2.dimensions()); + gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1); + sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType)); + + //concatenation = left.concatenate(right, 1); + VERIFY_IS_EQUAL(concatenation2.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation2.dimension(1), 6); + VERIFY_IS_EQUAL(concatenation2.dimension(2), 1); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0)); + } + for (int j = 3; j < 6; ++j) { + VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0)); + } + } + sycl_device.deallocate(gpu_out_data2); + Tensor concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); + DataType * gpu_out_data3 = static_cast(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap> gpu_out3(gpu_out_data3, concatenation3.dimensions()); + gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2); + sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType)); + + //concatenation = left.concatenate(right, 2); + VERIFY_IS_EQUAL(concatenation3.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation3.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation3.dimension(2), 2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0)); + VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0)); + } + } + sycl_device.deallocate(gpu_out_data3); + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); +} +template +static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + + Index leftDim1 = 2; + Index leftDim2 = 3; + Eigen::array leftRange = {{leftDim1, leftDim2}}; + + Index rightDim1 = 2; + Index rightDim2 = 3; + Eigen::array rightRange = {{rightDim1, rightDim2}}; + + Index concatDim1 = 4; + Index concatDim2 = 3; + Eigen::array resRange = {{concatDim1, concatDim2}}; + + Tensor left(leftRange); + Tensor right(rightRange); + Tensor result(resRange); + + left.setRandom(); + right.setRandom(); + result.setRandom(); + + DataType * gpu_in1_data = static_cast(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); + + + Eigen::TensorMap> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap> gpu_out(gpu_out_data, resRange); + + sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_out_data, 
result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); + +// t1.concatenate(t2, 0) = result; + gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out; + sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(left(i, j), result(i, j)); + VERIFY_IS_EQUAL(right(i, j), result(i+2, j)); + } + } + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); + sycl_device.deallocate(gpu_out_data); +} + + +template void tensorConcat_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_concatenation(sycl_device); + test_simple_concatenation(sycl_device); + test_concatenation_as_lvalue(sycl_device); +} +void test_cxx11_tensor_concatenation_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorConcat_perDevice(device)); + } +}
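
Usage note (not part of the patch): a minimal sketch of how the new SYCL concatenation path added here is exercised, mirroring the cxx11_tensor_concatenation_sycl.cpp test above. The chosen device index, tensor sizes, and float element type are illustrative assumptions only.

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Pick any SYCL-capable device; index 0 is an arbitrary choice here.
      auto devices = Eigen::get_sycl_supported_devices();
      Eigen::QueueInterface queueInterface(devices[0]);
      Eigen::SyclDevice sycl_device(&queueInterface);

      Eigen::Tensor<float, 3> left(2, 3, 1), right(2, 3, 1);
      left.setRandom();
      right.setRandom();

      // Device-side buffers, as in the test: allocate, map, copy host -> device.
      float* gpu_left  = static_cast<float*>(sycl_device.allocate(left.dimensions().TotalSize()  * sizeof(float)));
      float* gpu_right = static_cast<float*>(sycl_device.allocate(right.dimensions().TotalSize() * sizeof(float)));
      float* gpu_out   = static_cast<float*>(sycl_device.allocate(4 * 3 * 1 * sizeof(float)));

      Eigen::TensorMap<Eigen::Tensor<float, 3>> in1(gpu_left, 2, 3, 1);
      Eigen::TensorMap<Eigen::Tensor<float, 3>> in2(gpu_right, 2, 3, 1);
      Eigen::TensorMap<Eigen::Tensor<float, 3>> out(gpu_out, 4, 3, 1);

      sycl_device.memcpyHostToDevice(gpu_left,  left.data(),  left.dimensions().TotalSize()  * sizeof(float));
      sycl_device.memcpyHostToDevice(gpu_right, right.data(), right.dimensions().TotalSize() * sizeof(float));

      // Concatenate along dimension 0. With this patch the expression is evaluated
      // through the SYCL backend (TensorConcatenationOp functor extractor plus the
      // named kernel functors that replace the old parallel_for lambdas).
      out.device(sycl_device) = in1.concatenate(in2, 0);

      Eigen::Tensor<float, 3> result(4, 3, 1);
      sycl_device.memcpyDeviceToHost(result.data(), gpu_out, result.dimensions().TotalSize() * sizeof(float));

      sycl_device.deallocate(gpu_left);
      sycl_device.deallocate(gpu_right);
      sycl_device.deallocate(gpu_out);
      return 0;
    }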